22-T67/catboost.ipynb at master

19 KiB

Raw Permalink Blame History

In [1]:

import catboost
import pandas as pd

In [2]:

from catboost import CatBoostRegressor

In [3]:

data = pd.read_csv('./data/train_data_mod.csv')
data.head()

Out[3]:

	PM2.5	PM10	SO2	NO2	O3	O3_8h	CO	wd	ws	air_temp	...	PM2.5_transportation	PM2.5_resdient	PM2.5_power	pre_PM2.5	pre_PM10	pre_SO2	pre_NO2	pre_O3	pre_O3_8h	pre_CO
0	4.744932	5.176150	5.723585	3.663562	2.197225	2.302585	1.515127	58.0	0.7	-11.1	...	0.081248	0.827110	0.418587	136.0	214.0	317.0	38.0	8.0	9.0	3.71
1	4.584967	5.043425	5.726848	3.637586	2.079442	2.197225	1.506297	185.0	0.5	-11.7	...	0.088313	0.827110	0.412773	114.0	176.0	305.0	38.0	8.0	9.0	3.55
2	4.477337	4.955827	5.758902	3.663562	2.079442	2.197225	1.515127	0.0	0.2	-12.7	...	0.091256	0.827110	0.424400	97.0	154.0	306.0	37.0	7.0	8.0	3.51
3	4.454347	4.941642	5.680173	3.637586	2.079442	2.197225	1.530395	199.0	1.4	-10.9	...	0.092434	1.746121	0.459282	87.0	141.0	316.0	38.0	7.0	8.0	3.55
4	4.672829	5.123964	5.758902	3.637586	2.197225	2.197225	1.605430	359.0	1.2	-12.3	...	0.170738	3.446292	0.514513	85.0	139.0	292.0	37.0	7.0	8.0	3.62

5 rows × 49 columns

In [14]:

feature_cols = data.columns[7:]
out_cols = data.columns[:7]
out_cols

Out[14]:

Index(['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'O3_8h', 'CO'], dtype='object')

In [15]:

from sklearn.model_selection import train_test_split

In [16]:

train_X, test_X, train_y, test_y = train_test_split(data[feature_cols], data[out_cols], test_size=0.2,
                                                    random_state=42)
#准备参数
other_params = {'learning_rate': 0.01, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

params_gbm = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'l1',  # 目标函数
    'metric': 'rmse',  # 评估函数
    'max_depth': 10,
    'num_leaves': 20,  # 叶子节点数
    'learning_rate': 0.09,  # 学习速率
    'feature_fraction': 0.9,  # 建树的特征选择比例
    'bagging_fraction': 0.9,  # 建树的样本采样比例
    'bagging_freq': 10,  # k 意味着每 k 次迭代执行bagging
    'verbose': -1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}

In [17]:

import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt

In [18]:

base_model = lgb.LGBMRegressor(**params_gbm)

In [24]:

base_cat = CatBoostRegressor(iterations=1000, learning_rate=0.0005, depth=10, loss_function='RMSE', eval_metric='RMSE', random_seed=99, od_type='Iter', od_wait=50, verbose=0)

In [25]:

multioutputregressor = MultiOutputRegressor(base_cat).fit(train_X, train_y)

In [26]:

rst = multioutputregressor.predict(test_X)

In [27]:

out_results = pd.DataFrame(rst, columns=out_cols)
out_results

Out[27]:

	PM2.5	PM10	SO2	NO2	O3	O3_8h	CO
0	3.653185	4.700200	2.722381	3.261589	3.836444	3.857181	0.615219
1	4.323887	4.923196	3.198502	4.016752	3.166474	3.591639	0.824886
2	3.660165	4.662362	3.136948	3.513742	3.763910	3.671770	0.631061
3	3.728112	4.645958	3.514411	3.718547	3.199907	3.291750	0.862777
4	4.189668	4.743439	3.445615	3.674801	3.949052	3.695285	0.797655
...	...	...	...	...	...	...	...
8902	4.283530	4.995899	4.019444	3.961054	3.663294	3.314231	0.916339
8903	3.674866	4.606504	3.470283	3.307148	3.608739	3.626463	0.860751
8904	3.704409	4.350563	3.757374	3.636318	3.601366	3.539929	0.862651
8905	3.724967	4.673218	3.218182	3.765976	3.386151	2.954136	0.730686
8906	3.419836	4.188060	3.019416	3.307404	3.861704	3.746484	0.702885

8907 rows × 7 columns

In [28]:

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import numpy as np

In [29]:

for col in out_cols:
    MSE = mean_squared_error(out_results[col].values, test_y[col].values)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(out_results[col].values, test_y[col].values)
    MAPE = mean_absolute_percentage_error(out_results[col].values, test_y[col].values)
    R_2 = r2_score(out_results[col].values, test_y[col].values)
    print(f"COL: {col}, MSE: {format(MSE, '.2E')}", end=',')
    print(f'RMSE: {round(RMSE, 4)}', end=',')
    print(f'MAPE: {round(MAPE, 4) * 100} %', end=',')
    print(f'MAE: {round(MAE, 4)}', end=',')
    print(f'R_2: {round(R_2, 4)}')

COL: PM2.5, MSE: 2.30E-01,RMSE: 0.4799,MAPE: 9.99 %,MAE: 0.3802,R_2: -2.1369
COL: PM10, MSE: 1.64E-01,RMSE: 0.4053,MAPE: 6.97 %,MAE: 0.3164,R_2: -2.1513
COL: SO2, MSE: 4.32E-01,RMSE: 0.6574,MAPE: 16.439999999999998 %,MAE: 0.5326,R_2: -2.0811
COL: NO2, MSE: 1.48E-01,RMSE: 0.3843,MAPE: 8.52 %,MAE: 0.3095,R_2: -2.3884
COL: O3, MSE: 4.99E-01,RMSE: 0.7061,MAPE: 17.419999999999998 %,MAE: 0.5898,R_2: -2.0369
COL: O3_8h, MSE: 4.19E-01,RMSE: 0.6471,MAPE: 15.73 %,MAE: 0.5331,R_2: -1.936
COL: CO, MSE: 3.39E-02,RMSE: 0.1842,MAPE: 18.75 %,MAE: 0.1439,R_2: -2.1239

In [ ]:

19 KiB Raw Permalink Blame History Unescape Escape

19 KiB

Raw Permalink Blame History