19 KiB
19 KiB
In [1]:
import catboost import pandas as pd
In [2]:
from catboost import CatBoostRegressor
In [3]:
data = pd.read_csv('./data/train_data_mod.csv') data.head()
Out[3]:
PM2.5 | PM10 | SO2 | NO2 | O3 | O3_8h | CO | wd | ws | air_temp | ... | PM2.5_transportation | PM2.5_resdient | PM2.5_power | pre_PM2.5 | pre_PM10 | pre_SO2 | pre_NO2 | pre_O3 | pre_O3_8h | pre_CO | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.744932 | 5.176150 | 5.723585 | 3.663562 | 2.197225 | 2.302585 | 1.515127 | 58.0 | 0.7 | -11.1 | ... | 0.081248 | 0.827110 | 0.418587 | 136.0 | 214.0 | 317.0 | 38.0 | 8.0 | 9.0 | 3.71 |
1 | 4.584967 | 5.043425 | 5.726848 | 3.637586 | 2.079442 | 2.197225 | 1.506297 | 185.0 | 0.5 | -11.7 | ... | 0.088313 | 0.827110 | 0.412773 | 114.0 | 176.0 | 305.0 | 38.0 | 8.0 | 9.0 | 3.55 |
2 | 4.477337 | 4.955827 | 5.758902 | 3.663562 | 2.079442 | 2.197225 | 1.515127 | 0.0 | 0.2 | -12.7 | ... | 0.091256 | 0.827110 | 0.424400 | 97.0 | 154.0 | 306.0 | 37.0 | 7.0 | 8.0 | 3.51 |
3 | 4.454347 | 4.941642 | 5.680173 | 3.637586 | 2.079442 | 2.197225 | 1.530395 | 199.0 | 1.4 | -10.9 | ... | 0.092434 | 1.746121 | 0.459282 | 87.0 | 141.0 | 316.0 | 38.0 | 7.0 | 8.0 | 3.55 |
4 | 4.672829 | 5.123964 | 5.758902 | 3.637586 | 2.197225 | 2.197225 | 1.605430 | 359.0 | 1.2 | -12.3 | ... | 0.170738 | 3.446292 | 0.514513 | 85.0 | 139.0 | 292.0 | 37.0 | 7.0 | 8.0 | 3.62 |
5 rows × 49 columns
In [14]:
feature_cols = data.columns[7:] out_cols = data.columns[:7] out_cols
Out[14]:
Index(['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'O3_8h', 'CO'], dtype='object')
In [15]:
from sklearn.model_selection import train_test_split
In [16]:
train_X, test_X, train_y, test_y = train_test_split(data[feature_cols], data[out_cols], test_size=0.2, random_state=42) #准备参数 other_params = {'learning_rate': 0.01, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1} params_gbm = { 'task': 'train', 'boosting_type': 'gbdt', # 设置提升类型 'objective': 'l1', # 目标函数 'metric': 'rmse', # 评估函数 'max_depth': 10, 'num_leaves': 20, # 叶子节点数 'learning_rate': 0.09, # 学习速率 'feature_fraction': 0.9, # 建树的特征选择比例 'bagging_fraction': 0.9, # 建树的样本采样比例 'bagging_freq': 10, # k 意味着每 k 次迭代执行bagging 'verbose': -1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息 }
In [17]:
import lightgbm as lgb from sklearn.multioutput import MultiOutputRegressor import matplotlib.pyplot as plt
In [18]:
base_model = lgb.LGBMRegressor(**params_gbm)
In [24]:
base_cat = CatBoostRegressor(iterations=1000, learning_rate=0.0005, depth=10, loss_function='RMSE', eval_metric='RMSE', random_seed=99, od_type='Iter', od_wait=50, verbose=0)
In [25]:
multioutputregressor = MultiOutputRegressor(base_cat).fit(train_X, train_y)
In [26]:
rst = multioutputregressor.predict(test_X)
In [27]:
out_results = pd.DataFrame(rst, columns=out_cols) out_results
Out[27]:
PM2.5 | PM10 | SO2 | NO2 | O3 | O3_8h | CO | |
---|---|---|---|---|---|---|---|
0 | 3.653185 | 4.700200 | 2.722381 | 3.261589 | 3.836444 | 3.857181 | 0.615219 |
1 | 4.323887 | 4.923196 | 3.198502 | 4.016752 | 3.166474 | 3.591639 | 0.824886 |
2 | 3.660165 | 4.662362 | 3.136948 | 3.513742 | 3.763910 | 3.671770 | 0.631061 |
3 | 3.728112 | 4.645958 | 3.514411 | 3.718547 | 3.199907 | 3.291750 | 0.862777 |
4 | 4.189668 | 4.743439 | 3.445615 | 3.674801 | 3.949052 | 3.695285 | 0.797655 |
... | ... | ... | ... | ... | ... | ... | ... |
8902 | 4.283530 | 4.995899 | 4.019444 | 3.961054 | 3.663294 | 3.314231 | 0.916339 |
8903 | 3.674866 | 4.606504 | 3.470283 | 3.307148 | 3.608739 | 3.626463 | 0.860751 |
8904 | 3.704409 | 4.350563 | 3.757374 | 3.636318 | 3.601366 | 3.539929 | 0.862651 |
8905 | 3.724967 | 4.673218 | 3.218182 | 3.765976 | 3.386151 | 2.954136 | 0.730686 |
8906 | 3.419836 | 4.188060 | 3.019416 | 3.307404 | 3.861704 | 3.746484 | 0.702885 |
8907 rows × 7 columns
In [28]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error import numpy as np
In [29]:
for col in out_cols: MSE = mean_squared_error(out_results[col].values, test_y[col].values) RMSE = np.sqrt(MSE) MAE = mean_absolute_error(out_results[col].values, test_y[col].values) MAPE = mean_absolute_percentage_error(out_results[col].values, test_y[col].values) R_2 = r2_score(out_results[col].values, test_y[col].values) print(f"COL: {col}, MSE: {format(MSE, '.2E')}", end=',') print(f'RMSE: {round(RMSE, 4)}', end=',') print(f'MAPE: {round(MAPE, 4) * 100} %', end=',') print(f'MAE: {round(MAE, 4)}', end=',') print(f'R_2: {round(R_2, 4)}')
COL: PM2.5, MSE: 2.30E-01,RMSE: 0.4799,MAPE: 9.99 %,MAE: 0.3802,R_2: -2.1369 COL: PM10, MSE: 1.64E-01,RMSE: 0.4053,MAPE: 6.97 %,MAE: 0.3164,R_2: -2.1513 COL: SO2, MSE: 4.32E-01,RMSE: 0.6574,MAPE: 16.439999999999998 %,MAE: 0.5326,R_2: -2.0811 COL: NO2, MSE: 1.48E-01,RMSE: 0.3843,MAPE: 8.52 %,MAE: 0.3095,R_2: -2.3884 COL: O3, MSE: 4.99E-01,RMSE: 0.7061,MAPE: 17.419999999999998 %,MAE: 0.5898,R_2: -2.0369 COL: O3_8h, MSE: 4.19E-01,RMSE: 0.6471,MAPE: 15.73 %,MAE: 0.5331,R_2: -1.936 COL: CO, MSE: 3.39E-02,RMSE: 0.1842,MAPE: 18.75 %,MAE: 0.1439,R_2: -2.1239
In [ ]: