In [ ]:
!pip install pandas hyperopt xgboost scikit-learn matplotlib numpy
In [1]:
import pandas as pd
import numpy as np
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import train_test_split
In [2]:
data = pd.read_excel('./data/20240617/电容性能新.xlsx')
data.head()
Out[2]:
| | 比表面积 | 总孔体积 | 微孔体积 | 平均孔径 | 氮掺杂量at | 氧掺杂量 | ID/IG | 电流密度 | 比电容 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 0.5 | 206.5 |
| 1 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 1.0 | 179.1 |
| 2 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 2.0 | 163.3 |
| 3 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 5.0 | 146.0 |
| 4 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 10.0 | 137.8 |
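The columns are, in order: specific surface area (比表面积), total pore volume (总孔体积), micropore volume (微孔体积), average pore diameter (平均孔径), nitrogen doping in at% (氮掺杂量at), oxygen doping (氧掺杂量), the Raman ID/IG ratio, current density (电流密度), and the target, specific capacitance (比电容). Before modeling it is worth confirming the sheet loaded cleanly; a minimal sanity-check sketch:

In [ ]:
# Sanity check: row/column counts and missing values per column
print(data.shape)
print(data.isna().sum())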
In [3]:
out_cols = ['比电容']  # target column: specific capacitance
In [4]:
feature_cols = [x for x in data.columns if x not in out_cols]
In [5]:
train_data = data.reset_index(drop=True)
In [6]:
import xgboost as xgb
from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                             mean_absolute_percentage_error, r2_score)
In [7]:
# Define the hyperparameter search space
space = {
    'eta': hp.loguniform('eta', -5, 0),                          # learning rate, searched over [e^-5, 1]
    'max_depth': hp.choice('max_depth', range(5, 30)),           # maximum tree depth, searched over [5, 29]
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),   # minimum sum of instance weights in a child
    'gamma': hp.loguniform('gamma', -5, 0),                      # minimum loss reduction required to split a leaf
    'subsample': hp.uniform('subsample', 0.5, 1),                # row subsampling rate
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),  # column (feature) subsampling rate
}
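As a quick check on the ranges above, hyperopt can draw a random configuration from the space via `hyperopt.pyll.stochastic`; a minimal sketch:

In [ ]:
from hyperopt.pyll import stochastic

# Draw one random configuration to confirm the ranges look sensible
print(stochastic.sample(space))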
In [8]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data[feature_cols], train_data[out_cols], test_size=0.3, random_state=42)
In [9]:
# Objective function: train a model and return the validation error to minimize
def objective(params):
    # Instantiate an XGBoost regressor with the sampled hyperparameters
    gbr = xgb.XGBRegressor(**params)
    # Fit on the training split
    gbr.fit(X_train, y_train)
    # Evaluate on the held-out test split
    y_pred = gbr.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    return {'loss': mae, 'status': STATUS_OK}
In [10]:
# Create a Trials object to record the search history
trials = Trials()
# Run the hyperparameter optimization with fmin
best_params = fmin(fn=objective, space=space, algo=tpe.suggest,
                   max_evals=100, trials=trials)
100%|██████████| 100/100 [00:08<00:00, 11.55trial/s, best loss: 12.132344347686164]
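The Trials object records every evaluation, which makes it easy to check whether the search converged. A sketch plotting the per-trial loss and the running best (matplotlib is imported again later in the notebook; it is included here so the cell is self-contained):

In [ ]:
import matplotlib.pyplot as plt

# Per-trial loss and the best loss seen so far
losses = trials.losses()
plt.plot(losses, '.', label='trial loss')
plt.plot(np.minimum.accumulate(losses), label='best so far')
plt.xlabel('trial')
plt.ylabel('MAE')
plt.legend(loc='best')
plt.show()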
In [11]:
print(best_params)
{'colsample_bytree': 0.8857035476046763, 'eta': 0.11588664776521924, 'gamma': 0.007847746718601799, 'max_depth': 10, 'min_child_weight': 6.396614191886977, 'subsample': 0.7070880429614513}
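Note that for `hp.choice` parameters, `fmin` returns the *index* into the choice list rather than the value itself: the `max_depth: 10` above refers to `range(5, 30)[10]`, i.e. an actual depth of 15. A minimal sketch mapping the result back to concrete values with hyperopt's `space_eval` (stored in a new variable, `resolved_params`, so the cells below are unchanged):

In [ ]:
from hyperopt import space_eval

# Convert choice indices (e.g. max_depth) back to actual parameter values
resolved_params = space_eval(space, best_params)
print(resolved_params)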
In [12]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=42)
In [13]:
num_boost_round = 1000  # upper bound on boosting rounds; early stopping below usually halts earlier
In [14]:
import matplotlib.pyplot as plt
In [15]:
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font that can render the Chinese labels
plt.rcParams["axes.unicode_minus"] = False    # render minus signs correctly
In [16]:
eva_list = list()
eva_cols = ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']
for train_index, test_index in kf.split(train_data):
    train = train_data.loc[train_index]
    valid = train_data.loc[test_index]
    X_train, Y_train = train[feature_cols], train[out_cols]
    X_valid, Y_valid = valid[feature_cols], valid[out_cols]
    dtrain = xgb.DMatrix(X_train, Y_train)
    dvalid = xgb.DMatrix(X_valid, Y_valid)
    watchlist = [(dvalid, 'eval')]
    gb_model = xgb.train(best_params, dtrain, num_boost_round, evals=watchlist,
                         early_stopping_rounds=100, verbose_eval=False)
    y_pred = gb_model.predict(xgb.DMatrix(X_valid))
    y_true = Y_valid.values
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(y_true, y_pred)
    MAPE = mean_absolute_percentage_error(y_true, y_pred)
    R_2 = r2_score(y_true, y_pred)
    print('MSE:', round(MSE, 4), end=', ')
    print('RMSE:', round(RMSE, 4), end=', ')
    print('MAE:', round(MAE, 4), end=', ')
    print('MAPE:', round(MAPE * 100, 2), '%', end=', ')
    print('R_2:', round(R_2, 4))  # a negative R² means the fit is worse than predicting the mean
    eva_list.append([MSE, RMSE, MAE, MAPE, R_2])
MSE: 164.2816, RMSE: 12.8172, MAE: 9.1819, MAPE: 4.33 %, R_2: 0.9534
MSE: 172.8146, RMSE: 13.1459, MAE: 8.4597, MAPE: 4.24 %, R_2: 0.9475
MSE: 105.637, RMSE: 10.278, MAE: 7.1138, MAPE: 3.19 %, R_2: 0.9736
MSE: 306.2548, RMSE: 17.5001, MAE: 10.3353, MAPE: 4.27 %, R_2: 0.9348
MSE: 212.1827, RMSE: 14.5665, MAE: 10.452, MAPE: 4.64 %, R_2: 0.9467
MSE: 311.2193, RMSE: 17.6414, MAE: 10.62, MAPE: 3.97 %, R_2: 0.929
MSE: 479.0079, RMSE: 21.8862, MAE: 11.6752, MAPE: 5.11 %, R_2: 0.8952
MSE: 153.6563, RMSE: 12.3958, MAE: 8.8708, MAPE: 4.44 %, R_2: 0.9502
MSE: 285.905, RMSE: 16.9087, MAE: 10.4152, MAPE: 5.35 %, R_2: 0.9522
MSE: 570.9538, RMSE: 23.8946, MAE: 12.4216, MAPE: 5.98 %, R_2: 0.8954
In [17]:
eva_df = pd.DataFrame.from_records(eva_list, columns=eva_cols)
In [18]:
eva_df.mean()
Out[18]:
MSE     276.191297
RMSE     16.103459
MAE       9.954548
MAPE      0.045525
R2        0.937810
dtype: float64
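Note that scikit-learn's `mean_absolute_percentage_error` returns a fraction, so the MAPE of 0.045525 above corresponds to about 4.55 %. A small sketch reporting the cross-validated means with MAPE as a percentage:

In [ ]:
summary = eva_df.mean()
summary['MAPE'] *= 100  # express MAPE as a percentage
print(summary)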
In [19]:
# Note: y_true and y_pred here come from the last cross-validation fold only
plt.figure(figsize=(12, 8))
plt.plot(range(len(y_true)), y_true, 'o-', label='real')
plt.plot(range(len(y_pred)), y_pred, '*-', label='pred')
plt.legend(loc='best')
plt.title(f'{out_cols}')
plt.show()
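A parity (predicted vs. actual) scatter plot is a common companion diagnostic; a minimal sketch for the same last-fold predictions:

In [ ]:
# Parity plot: points on the diagonal indicate perfect predictions
plt.figure(figsize=(6, 6))
plt.scatter(y_true, y_pred, alpha=0.7)
lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
plt.plot(lims, lims, 'k--', label='y = x')
plt.xlabel('real')
plt.ylabel('pred')
plt.legend(loc='best')
plt.show()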