import pandas as pd
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import train_test_split
import numpy as np
data = pd.read_excel('./data/20240617/电容性能新.xlsx')
data.head()
| | 比表面积 | 总孔体积 | 微孔体积 | 平均孔径 | 氮掺杂量at | 氧掺杂量 | ID/IG | 电流密度 | 比电容 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 0.5 | 206.5 |
| 1 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 1.0 | 179.1 |
| 2 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 2.0 | 163.3 |
| 3 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 5.0 | 146.0 |
| 4 | 1141.8 | 0.46 | 0.42 | 1.61 | 1.74 | 3.84 | 1.1 | 10.0 | 137.8 |
out_cols = ['比电容']
feature_cols = [x for x in data.columns if x not in out_cols]
train_data = data.reset_index(drop=True)
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
# Define the hyperparameter search space
space = {
    'eta': hp.loguniform('eta', -5, 0),                          # learning rate, log-uniform over [e^-5, 1] ≈ [0.0067, 1]
    'max_depth': hp.choice('max_depth', range(5, 30)),           # maximum tree depth, chosen from 5..29
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),   # minimum sum of instance weights in a child node
    'gamma': hp.loguniform('gamma', -5, 0),                      # minimum loss reduction required to split a leaf
    'subsample': hp.uniform('subsample', 0.5, 1),                # row subsampling rate per tree
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),  # column subsampling rate per tree
}
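To sanity-check what these distributions actually produce (for instance, that hp.loguniform('eta', -5, 0) spans roughly 0.0067 to 1, not 1e-5 to 1), you can draw a few random configurations from the space. A quick sketch, not part of the original pipeline, using hyperopt's stochastic sampler:

import hyperopt.pyll.stochastic

# Draw three random configurations to eyeball the sampled ranges
for _ in range(3):
    print(hyperopt.pyll.stochastic.sample(space))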
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data[feature_cols],
                                                    train_data[out_cols],
                                                    test_size=0.3, random_state=42)
# Objective function: evaluate one hyperparameter configuration
def objective(params):
    # Instantiate the XGBoost regressor with the sampled hyperparameters
    gbr = xgb.XGBRegressor(**params)
    # Fit on the training split
    gbr.fit(X_train, y_train)
    # Predict on the held-out test split
    y_pred = gbr.predict(X_test)
    # hyperopt minimizes the returned loss, so use the test MAE directly
    mae = mean_absolute_error(y_test, y_pred)
    return {'loss': mae, 'status': STATUS_OK}
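Note that this objective scores every trial against the same fixed 30% holdout, so the search can overfit to that particular split. A hedged alternative sketch (objective_cv is a hypothetical name, not what produced the results below) is to score each configuration by cross-validated MAE with sklearn's cross_val_score:

from sklearn.model_selection import cross_val_score

def objective_cv(params):
    # 5-fold CV on the training split; sklearn returns negated MAE, so flip the sign
    gbr = xgb.XGBRegressor(**params)
    scores = cross_val_score(gbr, X_train, y_train.values.ravel(),
                             scoring='neg_mean_absolute_error', cv=5)
    return {'loss': -scores.mean(), 'status': STATUS_OK}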
# Create a Trials object to record the search history
trials = Trials()
# Run the hyperparameter optimization with fmin (TPE algorithm, 100 evaluations)
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
100%|██████████| 100/100 [00:08<00:00, 11.55trial/s, best loss: 12.132344347686164]
print(best_params)
{'colsample_bytree': 0.8857035476046763, 'eta': 0.11588664776521924, 'gamma': 0.007847746718601799, 'max_depth': 10, 'min_child_weight': 6.396614191886977, 'subsample': 0.7070880429614513}
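One caveat: for hp.choice parameters, fmin returns the index of the chosen option rather than the option itself, so 'max_depth': 10 above is the index into range(5, 30), i.e. an actual depth of 15. hyperopt's space_eval maps the result back to real parameter values; a minimal sketch:

from hyperopt import space_eval

# Convert hp.choice indices back into actual parameter values
# before reusing best_params elsewhere (here 'max_depth' becomes 15)
best_params = space_eval(space, best_params)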
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
num_boost_round = 1000
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"] = ["SimHei"] # 设置字体
plt.rcParams["axes.unicode_minus"] = False # 正常显示负号
eva_list = list()
eva_cols = ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']
# 10-fold cross-validation with the tuned hyperparameters
for (train_index, test_index) in kf.split(train_data):
    train = train_data.loc[train_index]
    valid = train_data.loc[test_index]
    X_train, Y_train = train[feature_cols], train[out_cols]
    X_valid, Y_valid = valid[feature_cols], valid[out_cols]
    dtrain = xgb.DMatrix(X_train, Y_train)
    dvalid = xgb.DMatrix(X_valid, Y_valid)
    watchlist = [(dvalid, 'eval')]
    gb_model = xgb.train(best_params, dtrain, num_boost_round, evals=watchlist,
                         early_stopping_rounds=100, verbose_eval=False)
    y_pred = gb_model.predict(xgb.DMatrix(X_valid))
    y_true = Y_valid.values
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(y_true, y_pred)
    MAPE = mean_absolute_percentage_error(y_true, y_pred)
    R_2 = r2_score(y_true, y_pred)
    print('MSE:', round(MSE, 4), end=', ')
    print('RMSE:', round(RMSE, 4), end=', ')
    print('MAE:', round(MAE, 4), end=', ')
    print('MAPE:', round(MAPE*100, 2), '%', end=', ')
    print('R_2:', round(R_2, 4))  # a negative R² means the fit is worse than just predicting the mean
    eva_list.append([MSE, RMSE, MAE, MAPE, R_2])
MSE: 164.2816, RMSE: 12.8172, MAE: 9.1819, MAPE: 4.33 %, R_2: 0.9534
MSE: 172.8146, RMSE: 13.1459, MAE: 8.4597, MAPE: 4.24 %, R_2: 0.9475
MSE: 105.637, RMSE: 10.278, MAE: 7.1138, MAPE: 3.19 %, R_2: 0.9736
MSE: 306.2548, RMSE: 17.5001, MAE: 10.3353, MAPE: 4.27 %, R_2: 0.9348
MSE: 212.1827, RMSE: 14.5665, MAE: 10.452, MAPE: 4.64 %, R_2: 0.9467
MSE: 311.2193, RMSE: 17.6414, MAE: 10.62, MAPE: 3.97 %, R_2: 0.929
MSE: 479.0079, RMSE: 21.8862, MAE: 11.6752, MAPE: 5.11 %, R_2: 0.8952
MSE: 153.6563, RMSE: 12.3958, MAE: 8.8708, MAPE: 4.44 %, R_2: 0.9502
MSE: 285.905, RMSE: 16.9087, MAE: 10.4152, MAPE: 5.35 %, R_2: 0.9522
MSE: 570.9538, RMSE: 23.8946, MAE: 12.4216, MAPE: 5.98 %, R_2: 0.8954
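With early stopping, xgb.train keeps all boosting rounds built before stopping, and depending on the xgboost version, predict does not necessarily restrict itself to the best iteration found on the eval set. A hedged sketch of making that explicit via iteration_range (available in xgboost >= 1.4):

# Predict using only the trees up to the early-stopped best iteration
y_pred = gb_model.predict(xgb.DMatrix(X_valid),
                          iteration_range=(0, gb_model.best_iteration + 1))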
eva_df = pd.DataFrame.from_records(eva_list, columns=eva_cols)
eva_df.mean()
MSE     276.191297
RMSE     16.103459
MAE       9.954548
MAPE      0.045525
R2        0.937810
dtype: float64
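The fold means alone hide how much the metrics vary across folds (fold RMSE ranges from about 10.3 to 23.9 above); reporting the standard deviation alongside the mean makes that spread visible. One way, using plain pandas:

# Mean and standard deviation of each metric across the 10 folds
print(eva_df.agg(['mean', 'std']).round(4))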
# Plot predicted vs. real values for the last fold's validation set
plt.figure(figsize=(12, 8))
plt.plot(range(len(y_true)), y_true, 'o-', label='real')
plt.plot(range(len(y_pred)), y_pred, '*-', label='pred')
plt.legend(loc='best')
plt.title(f'{out_cols[0]}')
plt.show()