coal_materials/20240102.ipynb

22 KiB
Raw Blame History

In [1]:
import pandas as pd
In [6]:
data_0102 = pd.read_excel('./data/20240102/20240102.xlsx', header=[0,1,2])
data_0102
Out[6]:
Unnamed: 0_level_0 弹筒发热量 挥发分 固定炭
化验编号 Had Cad Nad Oad Qb,ad Vad Fcad
Unnamed: 0_level_2 (%) (%) (%) (%) MJ/kg (%) (%)
0 2720110529 3.93 70.18 0.81 25.079 27.820 32.06 55.68
1 2720096883 3.78 68.93 0.77 26.512 27.404 29.96 54.71
2 2720109084 3.48 69.60 0.76 26.148 27.578 29.31 55.99
3 2720084708 3.47 66.71 0.76 29.055 26.338 28.58 53.87
4 2720062721 3.87 68.78 0.80 26.542 27.280 29.97 54.78
... ... ... ... ... ... ... ... ...
223 2720030490 4.12 68.85 0.97 26.055 27.864 32.94 51.89
224 2720028633 3.97 67.04 0.94 28.043 27.368 31.88 51.38
225 2720028634 4.12 68.42 0.96 26.493 27.886 33.16 52.00
226 2720017683 3.88 67.42 0.94 27.760 26.616 31.65 50.56
227 2720017678 3.81 66.74 0.92 28.530 26.688 31.02 50.82

228 rows × 8 columns

In [10]:
cols = [''.join([y for y in x if 'Unnamed' not in y]) for x in data_0102.columns]
cols
Out[10]:
['化验编号',
 '氢Had(%)',
 '碳Cad(%)',
 '氮Nad(%)',
 '氧Oad(%)',
 '弹筒发热量Qb,adMJ/kg',
 '挥发分Vad(%)',
 '固定炭Fcad(%)']
In [11]:
data_0102.columns = cols
In [27]:
import xgboost as xgb
In [16]:
feature_cols = cols[1:6]
feature_cols
Out[16]:
['氢Had(%)', '碳Cad(%)', '氮Nad(%)', '氧Oad(%)', '弹筒发热量Qb,adMJ/kg']
In [44]:
train_data = data_0102.copy()
train_data.head()
Out[44]:
化验编号 氢Had(%) 碳Cad(%) 氮Nad(%) 氧Oad(%) 弹筒发热量Qb,adMJ/kg 挥发分Vad(%) 固定炭Fcad(%)
0 2720110529 3.93 70.18 0.81 25.079 27.820 32.06 55.68
1 2720096883 3.78 68.93 0.77 26.512 27.404 29.96 54.71
2 2720109084 3.48 69.60 0.76 26.148 27.578 29.31 55.99
3 2720084708 3.47 66.71 0.76 29.055 26.338 28.58 53.87
4 2720062721 3.87 68.78 0.80 26.542 27.280 29.97 54.78
In [19]:
import seaborn as sns
In [21]:
import numpy as np
In [23]:
train_data.reset_index(drop=True, inplace=True)
In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
In [40]:
param_xgb = {"silent": True,
             "obj": 'reg:linear',
             "subsample": 1,
             "max_depth": 15,
             "eta": 0.3,
             "gamma": 0,
             "lambda": 1,
             "alpha": 0,
             "colsample_bytree": 0.9,}
num_round = 1000
In [41]:
from sklearn.model_selection import KFold, train_test_split
kf = KFold(n_splits=10, shuffle=True, random_state=42)
In [43]:
eva_list = list()
for (train_index, test_index) in kf.split(train_data):
    train = train_data.loc[train_index]
    valid = train_data.loc[test_index]
    X_train, Y_train = train[feature_cols], np.log1p(train['挥发分Vad(%)'])
    X_valid, Y_valid = valid[feature_cols], np.log1p(valid['挥发分Vad(%)'])
    dtrain = xgb.DMatrix(X_train, Y_train)
    dvalid = xgb.DMatrix(X_valid, Y_valid)
    watchlist = [(dvalid, 'eval')]
    gb_model = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist,
                    early_stopping_rounds=50, verbose_eval=False)
    y_pred = np.expm1(gb_model.predict(xgb.DMatrix(X_valid)))
    y_true = np.expm1(Y_valid.values)
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    MAE = mean_absolute_error(y_true, y_pred)
    MAPE = mean_absolute_percentage_error(y_true, y_pred)
    R_2 = r2_score(y_true, y_pred)
    print('MSE:', round(MSE, 4), end=', ')
    print('RMSE:', round(RMSE, 4), end=', ')
    print('MAE:', round(MAE, 4), end=', ')
    print('MAPE:', round(MAPE*100, 2), '%', end=', ')
    print('R_2:', round(R_2, 4))  #R方为负就说明拟合效果比平均值差
    eva_list.append([MSE, RMSE, MAE, MAPE, R_2])
data_df = pd.DataFrame.from_records(eva_list, columns=['MSE', 'RMSE', 'MAE', 'MAPE', 'R_2'])
data_df.mean()
MSE: 1.9436, RMSE: 1.3941, MAE: 1.1861, MAPE: 3.94 %, R_2: 0.6095
MSE: 1.8735, RMSE: 1.3688, MAE: 1.132, MAPE: 3.77 %, R_2: 0.495
MSE: 2.0587, RMSE: 1.4348, MAE: 1.0706, MAPE: 4.08 %, R_2: 0.7862
MSE: 1.9298, RMSE: 1.3892, MAE: 1.1469, MAPE: 3.84 %, R_2: 0.5332
MSE: 1.4583, RMSE: 1.2076, MAE: 1.097, MAPE: 3.67 %, R_2: 0.6894
MSE: 2.0822, RMSE: 1.443, MAE: 1.1645, MAPE: 3.88 %, R_2: 0.5975
MSE: 1.3521, RMSE: 1.1628, MAE: 0.9905, MAPE: 3.37 %, R_2: 0.7479
MSE: 1.4057, RMSE: 1.1856, MAE: 0.9998, MAPE: 3.3 %, R_2: 0.2946
MSE: 2.2274, RMSE: 1.4925, MAE: 1.2638, MAPE: 4.19 %, R_2: 0.6785
MSE: 1.4866, RMSE: 1.2193, MAE: 1.0797, MAPE: 3.67 %, R_2: 0.7261
Out[43]:
MSE     1.781792
RMSE    1.329760
MAE     1.113084
MAPE    0.037719
R_2     0.615796
dtype: float64
In [48]:
eva_list = list()
for (train_index, test_index) in kf.split(train_data):
    train = train_data.loc[train_index]
    valid = train_data.loc[test_index]
    X_train, Y_train = train[feature_cols], np.log1p(train['固定炭Fcad(%)'])
    X_valid, Y_valid = valid[feature_cols], np.log1p(valid['固定炭Fcad(%)'])
    dtrain = xgb.DMatrix(X_train, Y_train)
    dvalid = xgb.DMatrix(X_valid, Y_valid)
    watchlist = [(dvalid, 'eval')]
    gb_model = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist,
                    early_stopping_rounds=50, verbose_eval=False)
    y_pred = np.expm1(gb_model.predict(xgb.DMatrix(X_valid)))
    y_true = np.expm1(Y_valid.values)
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    MAE = mean_absolute_error(y_true, y_pred)
    MAPE = mean_absolute_percentage_error(y_true, y_pred)
    R_2 = r2_score(y_true, y_pred)
    print('MSE:', round(MSE, 4), end=', ')
    print('RMSE:', round(RMSE, 4), end=', ')
    print('MAE:', round(MAE, 4), end=', ')
    print('MAPE:', round(MAPE*100, 2), '%', end=', ')
    print('R2:', round(R_2, 4))  #R方为负就说明拟合效果比平均值差
    eva_list.append([MSE, RMSE, MAE, MAPE, R_2])
data_df = pd.DataFrame.from_records(eva_list, columns=['MSE', 'RMSE', 'MAE', 'MAPE', 'R2'])
data_df.mean()
MSE: 4.6724, RMSE: 2.1616, MAE: 1.7297, MAPE: 3.42 %, R2: 0.8346
MSE: 3.0512, RMSE: 1.7468, MAE: 1.4485, MAPE: 2.62 %, R2: 0.8011
MSE: 7.6672, RMSE: 2.769, MAE: 1.951, MAPE: 4.56 %, R2: 0.8856
MSE: 4.0334, RMSE: 2.0083, MAE: 1.487, MAPE: 2.77 %, R2: 0.8216
MSE: 2.6382, RMSE: 1.6243, MAE: 1.1551, MAPE: 2.12 %, R2: 0.846
MSE: 5.8097, RMSE: 2.4103, MAE: 1.8683, MAPE: 3.8 %, R2: 0.83
MSE: 2.3446, RMSE: 1.5312, MAE: 1.1294, MAPE: 2.28 %, R2: 0.9069
MSE: 3.0069, RMSE: 1.734, MAE: 1.3782, MAPE: 2.46 %, R2: 0.6541
MSE: 4.1652, RMSE: 2.0409, MAE: 1.5685, MAPE: 3.2 %, R2: 0.859
MSE: 4.2023, RMSE: 2.05, MAE: 1.6284, MAPE: 3.2 %, R2: 0.869
Out[48]:
MSE     4.159107
RMSE    2.007631
MAE     1.534427
MAPE    0.030424
R2      0.830794
dtype: float64
In [67]:
 
Out[67]:
MSE RMSE MAE MAPE R2
十折交叉验证均值 4.159107 2.007631 1.534427 0.030424 0.830794
In [ ]: