22-T67/catboost.ipynb

19 KiB
Raw Permalink Blame History

In [1]:
import catboost
import pandas as pd
In [2]:
from catboost import CatBoostRegressor
In [3]:
data = pd.read_csv('./data/train_data_mod.csv')
data.head()
Out[3]:
PM2.5 PM10 SO2 NO2 O3 O3_8h CO wd ws air_temp ... PM2.5_transportation PM2.5_resdient PM2.5_power pre_PM2.5 pre_PM10 pre_SO2 pre_NO2 pre_O3 pre_O3_8h pre_CO
0 4.744932 5.176150 5.723585 3.663562 2.197225 2.302585 1.515127 58.0 0.7 -11.1 ... 0.081248 0.827110 0.418587 136.0 214.0 317.0 38.0 8.0 9.0 3.71
1 4.584967 5.043425 5.726848 3.637586 2.079442 2.197225 1.506297 185.0 0.5 -11.7 ... 0.088313 0.827110 0.412773 114.0 176.0 305.0 38.0 8.0 9.0 3.55
2 4.477337 4.955827 5.758902 3.663562 2.079442 2.197225 1.515127 0.0 0.2 -12.7 ... 0.091256 0.827110 0.424400 97.0 154.0 306.0 37.0 7.0 8.0 3.51
3 4.454347 4.941642 5.680173 3.637586 2.079442 2.197225 1.530395 199.0 1.4 -10.9 ... 0.092434 1.746121 0.459282 87.0 141.0 316.0 38.0 7.0 8.0 3.55
4 4.672829 5.123964 5.758902 3.637586 2.197225 2.197225 1.605430 359.0 1.2 -12.3 ... 0.170738 3.446292 0.514513 85.0 139.0 292.0 37.0 7.0 8.0 3.62

5 rows × 49 columns

In [14]:
feature_cols = data.columns[7:]
out_cols = data.columns[:7]
out_cols
Out[14]:
Index(['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'O3_8h', 'CO'], dtype='object')
In [15]:
from sklearn.model_selection import train_test_split
In [16]:
train_X, test_X, train_y, test_y = train_test_split(data[feature_cols], data[out_cols], test_size=0.2,
                                                    random_state=42)
#准备参数
other_params = {'learning_rate': 0.01, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

params_gbm = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'l1',  # 目标函数
    'metric': 'rmse',  # 评估函数
    'max_depth': 10,
    'num_leaves': 20,  # 叶子节点数
    'learning_rate': 0.09,  # 学习速率
    'feature_fraction': 0.9,  # 建树的特征选择比例
    'bagging_fraction': 0.9,  # 建树的样本采样比例
    'bagging_freq': 10,  # k 意味着每 k 次迭代执行bagging
    'verbose': -1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}
In [17]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt
In [18]:
base_model = lgb.LGBMRegressor(**params_gbm)
In [24]:
base_cat = CatBoostRegressor(iterations=1000, learning_rate=0.0005, depth=10, loss_function='RMSE', eval_metric='RMSE', random_seed=99, od_type='Iter', od_wait=50, verbose=0)
In [25]:
multioutputregressor = MultiOutputRegressor(base_cat).fit(train_X, train_y)
In [26]:
rst = multioutputregressor.predict(test_X)
In [27]:
out_results = pd.DataFrame(rst, columns=out_cols)
out_results
Out[27]:
PM2.5 PM10 SO2 NO2 O3 O3_8h CO
0 3.653185 4.700200 2.722381 3.261589 3.836444 3.857181 0.615219
1 4.323887 4.923196 3.198502 4.016752 3.166474 3.591639 0.824886
2 3.660165 4.662362 3.136948 3.513742 3.763910 3.671770 0.631061
3 3.728112 4.645958 3.514411 3.718547 3.199907 3.291750 0.862777
4 4.189668 4.743439 3.445615 3.674801 3.949052 3.695285 0.797655
... ... ... ... ... ... ... ...
8902 4.283530 4.995899 4.019444 3.961054 3.663294 3.314231 0.916339
8903 3.674866 4.606504 3.470283 3.307148 3.608739 3.626463 0.860751
8904 3.704409 4.350563 3.757374 3.636318 3.601366 3.539929 0.862651
8905 3.724967 4.673218 3.218182 3.765976 3.386151 2.954136 0.730686
8906 3.419836 4.188060 3.019416 3.307404 3.861704 3.746484 0.702885

8907 rows × 7 columns

In [28]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import numpy as np
In [29]:
for col in out_cols:
    MSE = mean_squared_error(out_results[col].values, test_y[col].values)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(out_results[col].values, test_y[col].values)
    MAPE = mean_absolute_percentage_error(out_results[col].values, test_y[col].values)
    R_2 = r2_score(out_results[col].values, test_y[col].values)
    print(f"COL: {col}, MSE: {format(MSE, '.2E')}", end=',')
    print(f'RMSE: {round(RMSE, 4)}', end=',')
    print(f'MAPE: {round(MAPE, 4) * 100} %', end=',')
    print(f'MAE: {round(MAE, 4)}', end=',')
    print(f'R_2: {round(R_2, 4)}')
COL: PM2.5, MSE: 2.30E-01,RMSE: 0.4799,MAPE: 9.99 %,MAE: 0.3802,R_2: -2.1369
COL: PM10, MSE: 1.64E-01,RMSE: 0.4053,MAPE: 6.97 %,MAE: 0.3164,R_2: -2.1513
COL: SO2, MSE: 4.32E-01,RMSE: 0.6574,MAPE: 16.439999999999998 %,MAE: 0.5326,R_2: -2.0811
COL: NO2, MSE: 1.48E-01,RMSE: 0.3843,MAPE: 8.52 %,MAE: 0.3095,R_2: -2.3884
COL: O3, MSE: 4.99E-01,RMSE: 0.7061,MAPE: 17.419999999999998 %,MAE: 0.5898,R_2: -2.0369
COL: O3_8h, MSE: 4.19E-01,RMSE: 0.6471,MAPE: 15.73 %,MAE: 0.5331,R_2: -1.936
COL: CO, MSE: 3.39E-02,RMSE: 0.1842,MAPE: 18.75 %,MAE: 0.1439,R_2: -2.1239
In [ ]: