  • [Machine Learning] Regression
    Studies/Data Analytics&ML 2023. 1. 23. 14:32

     01 Loading the Dataset

    # Base libraries
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # Load the Boston housing dataset from the sklearn datasets module
    # (note: load_boston only exists in scikit-learn < 1.2, where it was removed)
    from sklearn import datasets
    housing = datasets.load_boston()
    
    # The result is dictionary-like, so check its keys
    housing.keys()
    
    # Convert to pandas DataFrames
    data = pd.DataFrame(housing['data'], columns=housing['feature_names'])
    target = pd.DataFrame(housing['target'], columns=['Target'])
    # Dataset dimensions
    print(data.shape)
    print(target.shape)
    print(data.head())
    print(housing['feature_names'])
    print(target.head())
    
    # Combine the DataFrames
    df = pd.concat([data, target], axis=1)
    df.head(2)
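
    Note: load_boston was removed in scikit-learn 1.2, so the import above only works on older versions. If it fails, here is a minimal sketch that rebuilds the same data and target DataFrames from the original CMU source (the URL comes from scikit-learn's deprecation notice; network access is assumed).

    # Alternative loading path for scikit-learn >= 1.2 (load_boston removed)
    import numpy as np
    import pandas as pd
    
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                     'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    data = pd.DataFrame(np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
                        columns=feature_names)
    target = pd.DataFrame(raw_df.values[1::2, 2], columns=['Target'])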

     


     02 Exploratory Data Analysis (EDA)

    # Correlation matrix
    df_corr = df.corr()
    
    # Draw a heatmap
    plt.figure(figsize=(10, 10))
    sns.set(font_scale=0.8)
    sns.heatmap(df_corr, annot=True, cbar=False)
    plt.show()
    
    # Correlation analysis: rank the features by the strength of
    # their correlation with the Target variable
    corr_order = df_corr.loc[:'LSTAT', 'Target'].abs().sort_values(ascending=False)
    corr_order
    
    # Keep the 4 features most correlated with Target
    plot_cols = ['Target', 'LSTAT', 'RM', 'PTRATIO', 'INDUS']
    plot_df = df.loc[:, plot_cols]
    plot_df.head()
    
    # Draw regplots with fitted regression lines
    plt.figure(figsize=(10, 10))
    print(plot_cols[1:])
    for idx, col in enumerate(plot_cols[1:]):
        print('idx=', idx, 'col=', col)
        ax1 = plt.subplot(2, 2, idx+1)
        # regplot combines a scatter plot with a linear regression line
        sns.regplot(x=col, y=plot_cols[0], data=plot_df, ax=ax1)
    plt.show()

    [Figure: regplots of LSTAT, RM, PTRATIO, and INDUS against Target]
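
    For reference, df.corr() computes pairwise Pearson correlations. A single pair can be reproduced by hand from the definition r = cov(x, y) / (std(x) * std(y)); a quick sketch using the df built above:

    # Pearson correlation of LSTAT vs. Target, computed manually
    import numpy as np
    lstat, tgt = df['LSTAT'], df['Target']
    cov = ((lstat - lstat.mean()) * (tgt - tgt.mean())).sum()
    r = cov / np.sqrt(((lstat - lstat.mean())**2).sum() * ((tgt - tgt.mean())**2).sum())
    print(round(r, 4), round(lstat.corr(tgt), 4))  # the two values match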

     


     

    # Distribution of the Target variable
    sns.displot(x='Target', kind='hist', data=df)
    plt.show()

    [Figure: histogram of the Target distribution]

     

    03 Data Preprocessing

     

    Feature Scaling

    # Apply scikit-learn's MinMaxScaler
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    
    df_scaled = df.iloc[:, :-1]
    scaler.fit(df_scaled)
    df_scaled = scaler.transform(df_scaled)
    
    # Write the scaled values back into the DataFrame
    df.iloc[:, :-1] = df_scaled[:, :]
    df.head()

    MinMaxScaler rescales every column to the 0-1 range; here it is applied to all columns except the last one, 'Target'.

     

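
    As a quick sanity check, the min-max formula x' = (x - min) / (max - min) can be verified on a toy column (illustrative values, not from the housing data):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    
    x = np.array([[1.0], [3.0], [5.0]])           # toy column
    manual = (x - x.min()) / (x.max() - x.min())  # -> [[0.0], [0.5], [1.0]]
    print(np.allclose(manual, MinMaxScaler().fit_transform(x)))  # True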


    Splitting into Training and Test Sets

    # Split into training and test sets
    from sklearn.model_selection import train_test_split
    X_data = df.loc[:, ['LSTAT', 'RM']]
    y_data = df.loc[:, 'Target']
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, 
                                                        test_size=0.2, 
                                                        shuffle=True, 
                                                        random_state=12)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

     

     

    04 Baseline Model: Linear Regression

    # Linear regression model
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    
    print("Regression coefficients (slopes): ", np.round(lr.coef_, 1))
    print("Intercept: ", np.round(lr.intercept_, 1))
    
    
    # Prediction
    y_test_pred = lr.predict(X_test)
    
    # Distribution of predicted vs. actual values
    plt.figure(figsize=(10, 5))
    plt.scatter(X_test['LSTAT'], y_test, label='y_test')
    plt.scatter(X_test['LSTAT'], y_test_pred, c='r', label='y_pred')
    plt.legend(loc='best')
    plt.show()
    
    
    # Evaluation
    from sklearn.metrics import mean_squared_error
    y_train_pred = lr.predict(X_train)
    
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)

     

    [Figure: predicted vs. actual values on the test set]
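
    Since a fitted LinearRegression simply computes y_hat = X @ coef_ + intercept_, the predictions can be reproduced by hand. A small check, reusing lr, X_test, and y_test_pred from the cells above:

    # Manual prediction should match lr.predict exactly
    import numpy as np
    manual_pred = X_test.values @ lr.coef_ + lr.intercept_
    print(np.allclose(manual_pred, y_test_pred))  # True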

     05 Cross-Validation

    # The cross_val_score function
    from sklearn.model_selection import cross_val_score
    lr = LinearRegression()
    mse_scores = -1 * cross_val_score(lr, X_train, y_train, cv=5,
                                      scoring='neg_mean_squared_error')
    print("MSE of each fold: ", np.round(mse_scores, 4))
    print("Mean MSE: %.4f" % np.mean(mse_scores))

     

     

     06 L1/L2 Regularization

    # Degree-2 polynomial feature transform
    from sklearn.preprocessing import PolynomialFeatures
    pf = PolynomialFeatures(degree=2)
    X_train_poly = pf.fit_transform(X_train)
    print("Original training set: ", X_train.shape)
    print("Degree-2 polynomial training set: ", X_train_poly.shape)

     

    # Fit a linear regression model on a degree-15 polynomial transform
    pf = PolynomialFeatures(degree=15)
    X_train_poly = pf.fit_transform(X_train)
    
    lr = LinearRegression()
    lr.fit(X_train_poly, y_train)
    
    # Predict and evaluate on the training and test data
    y_train_pred = lr.predict(X_train_poly)
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    
    X_test_poly = pf.transform(X_test)  # reuse the transform fitted on the training data
    y_test_pred = lr.predict(X_test_poly)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)

     

     

    # Model fit as the polynomial degree changes
    plt.figure(figsize=(15, 5))
    for n, deg in enumerate([1, 2, 15]):
        ax1 = plt.subplot(1, 3, n+1)
        # Fit a polynomial regression model for each degree
        pf = PolynomialFeatures(degree=deg)
        X_train_poly = pf.fit_transform(X_train.loc[:, ['LSTAT']])
        X_test_poly = pf.transform(X_test.loc[:, ['LSTAT']])
        lr = LinearRegression()
        lr.fit(X_train_poly, y_train)
        y_test_pred = lr.predict(X_test_poly)
        # Actual values
        plt.scatter(X_test.loc[:, ['LSTAT']], y_test, label='Targets')
        # Predicted values
        plt.scatter(X_test.loc[:, ['LSTAT']], y_test_pred, label='Predictions')
        # Panel title
        plt.title("Degree %d" % deg)
        # Legend
        plt.legend()
    plt.show()

    [Figure: model fit for polynomial degrees 1, 2, and 15]

     

    Regularized Linear Regression Models

    # Rebuild the degree-15 polynomial features on both input features
    # (the loop above overwrote X_train_poly / X_test_poly with LSTAT-only versions)
    pf = PolynomialFeatures(degree=15)
    X_train_poly = pf.fit_transform(X_train)
    X_test_poly = pf.transform(X_test)
    
    # Ridge (L2 regularization)
    from sklearn.linear_model import Ridge
    rdg = Ridge(alpha=2.5)
    rdg.fit(X_train_poly, y_train)
    
    y_train_pred = rdg.predict(X_train_poly)
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    y_test_pred = rdg.predict(X_test_poly)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)
    
    # Lasso (L1 regularization)
    from sklearn.linear_model import Lasso
    las = Lasso(alpha=0.05)
    las.fit(X_train_poly, y_train)
    
    y_train_pred = las.predict(X_train_poly)
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    y_test_pred = las.predict(X_test_poly)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)
    
    # ElasticNet (combined L1/L2 regularization)
    from sklearn.linear_model import ElasticNet
    ela = ElasticNet(alpha=0.01, l1_ratio=0.7)
    ela.fit(X_train_poly, y_train)
    
    y_train_pred = ela.predict(X_train_poly)
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    y_test_pred = ela.predict(X_test_poly)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)

     

    07 Tree-Based Models: Non-Linear Regression

    # Decision tree
    from sklearn.tree import DecisionTreeRegressor
    dtr = DecisionTreeRegressor(max_depth=3, random_state=12)
    dtr.fit(X_train, y_train)
    
    y_train_pred = dtr.predict(X_train)
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    
    y_test_pred = dtr.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)
    
    # Random forest
    from sklearn.ensemble import RandomForestRegressor
    rfr = RandomForestRegressor(max_depth=3, random_state=12)
    rfr.fit(X_train, y_train)
    
    y_train_pred = rfr.predict(X_train)
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    
    y_test_pred = rfr.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)
    
    # XGBoost
    from xgboost import XGBRegressor
    xgbr = XGBRegressor(objective='reg:squarederror', max_depth=3, random_state=12)
    xgbr.fit(X_train, y_train)
    
    y_train_pred = xgbr.predict(X_train)
    train_mse = mean_squared_error(y_train, y_train_pred)
    print("Train MSE: %.4f" % train_mse)
    
    y_test_pred = xgbr.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    print("Test MSE: %.4f" % test_mse)
