407 KiB
407 KiB
In [1]:
import pandas as pd
In [2]:
# Load the coal / activated-carbon workbook. header=[1] takes the column
# names from the second row of the sheet (row index 1).
data = pd.read_excel(
    './data/20240123/煤炭数据.xlsx',
    header=[1],
)

# Preview the first rows to check that the header row was parsed as expected.
data.head()
Out[2]:
灰分(d) | 挥发分(daf) | 活化剂种类 | 活化剂比例 | 混合方式 | 活化温度 | 活化时间 | 升温速率 | 比表面积 | 总孔体积 | 微孔体积 | Unnamed: 11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11.25 | 17.06 | KOH | 3.0 | 研磨 | 800 | 1.0 | 5.0 | 2784.0 | 1.0830 | 0.853 | 刘宇昊\n煤基活性炭的制备及其电化学性能研究 学位论文 |
1 | 8.53 | 13.46 | KOH | 3.0 | 研磨 | 800 | 1.0 | 5.0 | 2934.0 | 1.2290 | 1.074 | NaN |
2 | 18.08 | 13.85 | KOH | 3.0 | 研磨 | 800 | 1.0 | 5.0 | 3059.0 | 1.3044 | 1.011 | NaN |
3 | 11.42 | 12.31 | KOH | 3.0 | 研磨 | 800 | 1.0 | 5.0 | 2365.0 | 0.8030 | 0.605 | NaN |
4 | 11.60 | 8.49 | KOH | 3.0 | 研磨 | 800 | 1.0 | 5.0 | 2988.0 | 1.2820 | 0.944 | NaN |
In [3]:
# Remove the trailing free-text column ('Unnamed: 11'), which the preview
# shows holding a literature citation rather than a feature or target.
#
# The column is dropped by name, not by position, and without inplace=True.
# Dropping "the last column" in place is not idempotent: re-running the cell
# would silently delete the next last column (the '微孔体积' target).
# errors='ignore' makes a second execution a no-op.
data = data.drop(columns=['Unnamed: 11'], errors='ignore')
In [4]:
# Display the column labels currently present in `data`.
data.columns
Out[4]:
Index(['灰分(d)', '挥发分(daf)', '活化剂种类', '活化剂比例', '混合方式', '活化温度', '活化时间', '升温速率',
'比表面积', '总孔体积', '微孔体积'],
dtype='object')
In [5]:
# Categorical (string-valued) columns that are one-hot encoded before
# modelling.
object_cols = [
    '活化剂种类',
    '混合方式',
]
In [6]:
# One-hot encode the categorical columns listed in object_cols; each is
# replaced by one indicator column per category.
# NOTE(review): this rebinds `data`, so the cell is meant to run once per
# kernel session - a second run no longer finds the original columns.
data = pd.get_dummies(data=data, columns=object_cols)
In [7]:
# Regression targets; one model is fitted per entry.
out_cols = ['比表面积', '总孔体积', '微孔体积']

# Every remaining column is a model input (original column order is kept).
is_feature = ~data.columns.isin(out_cols)
feature_cols = list(data.columns[is_feature])
In [8]:
# Modelling frame: same rows as `data`, with the index replaced by a fresh
# default integer index (the old index is discarded, not kept as a column).
train_data = data.reset_index(drop=True)
In [9]:
# Sanity check: (number of rows, number of columns) of the modelling frame.
train_data.shape
Out[9]:
(174, 12)
In [10]:
import xgboost as xgb from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
In [11]:
from sklearn.model_selection import KFold, train_test_split

# 6-fold cross-validation splitter. Rows are shuffled before splitting and
# the seed is fixed, so the fold assignment is reproducible.
kf = KFold(
    n_splits=6,
    random_state=42,
    shuffle=True,
)
In [12]:
import numpy as np
In [18]:
# Booster parameters passed to xgb.train for every target and fold.
params_xgb = dict(
    objective='reg:squarederror',
    subsample=0.8,
    max_depth=20,
    eta=0.01,
    colsample_bytree=0.9,
)

# Upper bound on boosting rounds handed to xgb.train.
num_boost_round = 1000
In [19]:
import matplotlib.pyplot as plt
In [20]:
# Matplotlib defaults needed for the Chinese column names used as titles.
plt.rcParams.update({
    "font.sans-serif": ["SimHei"],   # font used for sans-serif text
    "axes.unicode_minus": False,     # render minus signs correctly
})
In [21]:
# Cross-validated evaluation: for every target in out_cols, train one XGBoost
# regressor per KFold split, print the per-fold metrics, plot a sample of
# predictions against the true values, and store the fold-averaged metrics.
#
# Results:
#   eva_total  - one array of mean [MSE, RMSE, MAE, MAPE, R2] per target
#   index_list - target names, in the same order as eva_total
#   eva_cols   - metric names, in the same order as the array entries
eva_total = list()
index_list = list()
eva_cols = ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']

# Only the first few predictions of each fold are plotted, to keep the
# figure readable.
plot_points_per_fold = 7

for col in out_cols:
    # Keep only rows where THIS target is known. The filtered rows go into a
    # per-target frame instead of overwriting train_data: overwriting made
    # rows dropped for an earlier target stay dropped for every later target,
    # and made the cell non-repeatable.
    target_data = train_data.dropna(subset=[col]).reset_index(drop=True)

    eva_list = list()
    cur_test = list()
    cur_real = list()

    for train_index, test_index in kf.split(target_data):
        # KFold yields positional indices, so select rows by position.
        train = target_data.iloc[train_index]
        valid = target_data.iloc[test_index]
        X_train, Y_train = train[feature_cols], train[col]
        X_valid, Y_valid = valid[feature_cols], valid[col]

        dtrain = xgb.DMatrix(X_train, Y_train)
        dvalid = xgb.DMatrix(X_valid, Y_valid)

        # NOTE(review): the held-out fold drives early stopping AND is the
        # fold that is scored below, so the reported metrics are likely
        # optimistic. Consider carving an inner validation split out of
        # `train` for early stopping.
        watchlist = [(dvalid, 'eval')]
        gb_model = xgb.train(params_xgb, dtrain, num_boost_round,
                             evals=watchlist, early_stopping_rounds=100,
                             verbose_eval=False)

        # Labels stored in dvalid are not used by predict, so it is reused.
        y_pred = gb_model.predict(dvalid)
        y_true = Y_valid.values

        MSE = mean_squared_error(y_true, y_pred)
        RMSE = np.sqrt(MSE)
        MAE = mean_absolute_error(y_true, y_pred)
        MAPE = mean_absolute_percentage_error(y_true, y_pred)
        R_2 = r2_score(y_true, y_pred)

        cur_test.extend(y_pred[:plot_points_per_fold])
        cur_real.extend(y_true[:plot_points_per_fold])

        print('MSE:', round(MSE, 4), end=', ')
        print('RMSE:', round(RMSE, 4), end=', ')
        print('MAE:', round(MAE, 4), end=', ')
        # MAPE is printed as a percentage but stored as a fraction.
        print('MAPE:', round(MAPE*100, 2), '%', end=', ')
        # A negative R^2 means the fit is worse than predicting the mean.
        print('R_2:', round(R_2, 4))
        eva_list.append([MSE, RMSE, MAE, MAPE, R_2])

    # One figure per target: sampled true vs. predicted values of all folds.
    plt.figure(figsize=(12, 8))
    plt.plot(range(len(cur_test)), cur_real, 'o-', label='real')
    plt.plot(range(len(cur_test)), cur_test, '*-', label='pred')
    plt.legend(loc='best')
    plt.title(f'{col}')
    plt.show()

    # Average each metric over the folds of this target.
    eva_total.append(np.mean(eva_list, axis=0))
    index_list.append(f"{col}")
MSE: 83933.6379, RMSE: 289.713, MAE: 205.8723, MAPE: 16.71 %, R_2: 0.8599 MSE: 151368.0568, RMSE: 389.0605, MAE: 331.2811, MAPE: 25.0 %, R_2: 0.8364 MSE: 179281.5189, RMSE: 423.4165, MAE: 293.9454, MAPE: 16.84 %, R_2: 0.7792 MSE: 230625.1215, RMSE: 480.2344, MAE: 288.9958, MAPE: 56.39 %, R_2: 0.5948 MSE: 212246.0972, RMSE: 460.7017, MAE: 312.8322, MAPE: 39.54 %, R_2: 0.6924 MSE: 231044.2089, RMSE: 480.6706, MAE: 359.0907, MAPE: 18.98 %, R_2: 0.6907
MSE: 0.0309, RMSE: 0.1758, MAE: 0.1127, MAPE: 14.96 %, R_2: 0.8353 MSE: 0.0477, RMSE: 0.2184, MAE: 0.1858, MAPE: 23.49 %, R_2: 0.8287 MSE: 0.0656, RMSE: 0.2561, MAE: 0.1692, MAPE: 17.14 %, R_2: 0.8098 MSE: 0.0338, RMSE: 0.184, MAE: 0.122, MAPE: 18.98 %, R_2: 0.7735 MSE: 0.0511, RMSE: 0.2261, MAE: 0.1652, MAPE: 36.1 %, R_2: 0.8148 MSE: 0.0684, RMSE: 0.2615, MAE: 0.192, MAPE: 18.13 %, R_2: 0.7924
MSE: 0.0185, RMSE: 0.1362, MAE: 0.0952, MAPE: 18.49 %, R_2: 0.7329 MSE: 0.0503, RMSE: 0.2242, MAE: 0.1338, MAPE: 21.21 %, R_2: 0.6868 MSE: 0.0768, RMSE: 0.2771, MAE: 0.1638, MAPE: 26.24 %, R_2: 0.4523 MSE: 0.0222, RMSE: 0.1489, MAE: 0.0975, MAPE: 21.11 %, R_2: 0.5256 MSE: 0.0395, RMSE: 0.1987, MAE: 0.1253, MAPE: 42.88 %, R_2: 0.5666 MSE: 0.0525, RMSE: 0.229, MAE: 0.1566, MAPE: 23.66 %, R_2: 0.2799
In [22]:
# Summary table: one row per target, one column per fold-averaged metric.
pd.DataFrame(data=eva_total, index=index_list, columns=eva_cols)
Out[22]:
MSE | RMSE | MAE | MAPE | R2 | |
---|---|---|---|---|---|
比表面积 | 181416.440212 | 420.632794 | 298.669574 | 0.289103 | 0.742225 |
总孔体积 | 0.049592 | 0.220315 | 0.157799 | 0.214682 | 0.809092 |
微孔体积 | 0.043286 | 0.202355 | 0.128714 | 0.255982 | 0.540697 |
In [ ]: