import pandas as pd
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import train_test_split
import numpy as np
data = pd.read_excel('./data/20240617/电容性能新.xlsx')
data.head()
| | 比表面积 | 总孔体积 | 微孔体积 | 平均孔径 | 氮掺杂量at | 氧掺杂量 | ID/IG | 电流密度 | 比电容 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 0.5 | 206.5 |
| 1 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 1.0 | 179.1 |
| 2 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 2.0 | 163.3 |
| 3 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 5.0 | 146.0 |
| 4 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 10.0 | 137.8 |
out_cols = ['比电容']
feature_cols = [x for x in data.columns if x not in out_cols]
train_data = data.reset_index(drop=True)
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
# Define the hyperparameter search space
space = {
    'eta': hp.loguniform('eta', -5, 0),                          # learning rate, log-uniform over [e^-5, 1] ≈ [0.0067, 1]
    'max_depth': hp.choice('max_depth', range(5, 30)),           # maximum tree depth, chosen from 5..29
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),   # minimum sum of instance weights in a child node
    'gamma': hp.loguniform('gamma', -5, 0),                      # minimum loss reduction required to split a leaf
    'subsample': hp.uniform('subsample', 0.5, 1),                # row subsampling rate per tree
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),  # column subsampling rate per tree
}
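To sanity-check what these distributions actually produce (for instance, that hp.loguniform('eta', -5, 0) spans roughly 0.0067 to 1, not 1e-5 to 1), you can draw a few random configurations from the space. A quick sketch, not part of the original pipeline, using hyperopt's stochastic sampler:

import hyperopt.pyll.stochastic

# Draw three random configurations to eyeball the sampled ranges
for _ in range(3):
    print(hyperopt.pyll.stochastic.sample(space))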
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data[feature_cols],
                                                    train_data[out_cols],
                                                    test_size=0.3, random_state=42)
# Objective function: evaluate one hyperparameter configuration
def objective(params):
    # Instantiate the XGBoost regressor with the sampled hyperparameters
    gbr = xgb.XGBRegressor(**params)
    # Fit on the training split
    gbr.fit(X_train, y_train)
    # Predict on the held-out test split
    y_pred = gbr.predict(X_test)
    # hyperopt minimizes the returned loss, so use the test MAE directly
    mae = mean_absolute_error(y_test, y_pred)
    return {'loss': mae, 'status': STATUS_OK}
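Note that this objective scores every trial against the same fixed 30% holdout, so the search can overfit to that particular split. A hedged alternative sketch (objective_cv is a hypothetical name, not what produced the results below) is to score each configuration by cross-validated MAE with sklearn's cross_val_score:

from sklearn.model_selection import cross_val_score

def objective_cv(params):
    # 5-fold CV on the training split; sklearn returns negated MAE, so flip the sign
    gbr = xgb.XGBRegressor(**params)
    scores = cross_val_score(gbr, X_train, y_train.values.ravel(),
                             scoring='neg_mean_absolute_error', cv=5)
    return {'loss': -scores.mean(), 'status': STATUS_OK}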
# Create a Trials object to record the search history
trials = Trials()
# Run the hyperparameter optimization with fmin (TPE algorithm, 100 evaluations)
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
100%|██████████| 100/100 [00:08<00:00, 11.55trial/s, best loss: 12.132344347686164]
print(best_params)
{'colsample_bytree': 0.8857035476046763, 'eta': 0.11588664776521924, 'gamma': 0.007847746718601799, 'max_depth': 10, 'min_child_weight': 6.396614191886977, 'subsample': 0.7070880429614513}
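One caveat: for hp.choice parameters, fmin returns the index of the chosen option rather than the option itself, so 'max_depth': 10 above is the index into range(5, 30), i.e. an actual depth of 15. hyperopt's space_eval maps the result back to real parameter values; a minimal sketch:

from hyperopt import space_eval

# Convert hp.choice indices back into actual parameter values
# before reusing best_params elsewhere (here 'max_depth' becomes 15)
best_params = space_eval(space, best_params)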
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
num_boost_round = 1000
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"] = ["SimHei"] # 设置字体
plt.rcParams["axes.unicode_minus"] = False # 正常显示负号
eva_list = list()
eva_cols = ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']
# 10-fold cross-validation with the tuned hyperparameters
for (train_index, test_index) in kf.split(train_data):
    train = train_data.loc[train_index]
    valid = train_data.loc[test_index]
    X_train, Y_train = train[feature_cols], train[out_cols]
    X_valid, Y_valid = valid[feature_cols], valid[out_cols]
    dtrain = xgb.DMatrix(X_train, Y_train)
    dvalid = xgb.DMatrix(X_valid, Y_valid)
    watchlist = [(dvalid, 'eval')]
    gb_model = xgb.train(best_params, dtrain, num_boost_round, evals=watchlist,
                         early_stopping_rounds=100, verbose_eval=False)
    y_pred = gb_model.predict(xgb.DMatrix(X_valid))
    y_true = Y_valid.values
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(y_true, y_pred)
    MAPE = mean_absolute_percentage_error(y_true, y_pred)
    R_2 = r2_score(y_true, y_pred)
    print('MSE:', round(MSE, 4), end=', ')
    print('RMSE:', round(RMSE, 4), end=', ')
    print('MAE:', round(MAE, 4), end=', ')
    print('MAPE:', round(MAPE*100, 2), '%', end=', ')
    print('R_2:', round(R_2, 4))  # a negative R² means the fit is worse than just predicting the mean
    eva_list.append([MSE, RMSE, MAE, MAPE, R_2])
MSE: 164.2816, RMSE: 12.8172, MAE: 9.1819, MAPE: 4.33 %, R_2: 0.9534
MSE: 172.8146, RMSE: 13.1459, MAE: 8.4597, MAPE: 4.24 %, R_2: 0.9475
MSE: 105.637, RMSE: 10.278, MAE: 7.1138, MAPE: 3.19 %, R_2: 0.9736
MSE: 306.2548, RMSE: 17.5001, MAE: 10.3353, MAPE: 4.27 %, R_2: 0.9348
MSE: 212.1827, RMSE: 14.5665, MAE: 10.452, MAPE: 4.64 %, R_2: 0.9467
MSE: 311.2193, RMSE: 17.6414, MAE: 10.62, MAPE: 3.97 %, R_2: 0.929
MSE: 479.0079, RMSE: 21.8862, MAE: 11.6752, MAPE: 5.11 %, R_2: 0.8952
MSE: 153.6563, RMSE: 12.3958, MAE: 8.8708, MAPE: 4.44 %, R_2: 0.9502
MSE: 285.905, RMSE: 16.9087, MAE: 10.4152, MAPE: 5.35 %, R_2: 0.9522
MSE: 570.9538, RMSE: 23.8946, MAE: 12.4216, MAPE: 5.98 %, R_2: 0.8954
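With early stopping, xgb.train keeps all boosting rounds built before stopping, and depending on the xgboost version, predict does not necessarily restrict itself to the best iteration found on the eval set. A hedged sketch of making that explicit via iteration_range (available in xgboost >= 1.4):

# Predict using only the trees up to the early-stopped best iteration
y_pred = gb_model.predict(xgb.DMatrix(X_valid),
                          iteration_range=(0, gb_model.best_iteration + 1))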
eva_df = pd.DataFrame.from_records(eva_list, columns=eva_cols)
eva_df.mean()
MSE     276.191297
RMSE     16.103459
MAE       9.954548
MAPE      0.045525
R2        0.937810
dtype: float64
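The fold means alone hide how much the metrics vary across folds (fold RMSE ranges from about 10.3 to 23.9 above); reporting the standard deviation alongside the mean makes that spread visible. One way, using plain pandas:

# Mean and standard deviation of each metric across the 10 folds
print(eva_df.agg(['mean', 'std']).round(4))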
# Plot predicted vs. real values for the last fold's validation set
plt.figure(figsize=(12, 8))
plt.plot(range(len(y_true)), y_true, 'o-', label='real')
plt.plot(range(len(y_pred)), y_pred, '*-', label='pred')
plt.legend(loc='best')
plt.title(f'{out_cols[0]}')
plt.show()