coal_materials/20231227-checkpoint.ipynb at c32ec7fe1542705332968c48b5c67a75a8c9ebb0

669 KiB

Raw Blame History

In [1]:

import pandas as pd

In [2]:

data = pd.read_excel('./data/20240102/20231227.xlsx', header=[0, 1])
data.head()

Out[2]:

	Unnamed: 0_level_0	原料特征		热处理条件		第一次热处理			第二次热处理		...	模板剂	活化剂		混合方式	碳材料结构特征					参考文献
	原料	是否预处理	软化点	热处理次数	是否是中温停留	温度	升温速率	保留时间	温度	升温速率·	...	与沥青比例	是否KOH活化	比例	混合方式	比表面积	总孔体积	微孔体积	平均孔径	平均孔径.1	Unnamed: 25_level_1
0	煤沥青	否	NaN	1	否	500	5	2	NaN	NaN	...	1.0	是	1.0	溶剂	908.07	0.40	0.34	1.75	NaN	姜宇晨硬模板法制备煤焦油沥青基多孔碳及其超电性能研究
1	煤沥青	否	NaN	1	否	600	5	2	NaN	NaN	...	1.0	是	0.5	溶剂	953.95	0.66	0.35	2.76	NaN	NaN
2	煤沥青	否	NaN	1	否	600	5	2	NaN	NaN	...	1.0	是	1.0	溶剂	1388.62	0.61	0.53	1.77	NaN	NaN
3	煤沥青	否	NaN	1	否	600	5	2	NaN	NaN	...	1.0	是	2.0	溶剂	1444.63	0.59	0.55	1.62	NaN	NaN
4	煤沥青	否	NaN	2	否	500	5	2	600.0	5.0	...	1.0	是	1.0	溶剂	1020.99	0.45	0.35	1.77	NaN	NaN

5 rows × 26 columns

In [3]:

data.columns = ['-'.join([y for y in x if 'Unnamed' not in y]) for x in data.columns]

In [4]:

ignore_cols = data.columns[:3].tolist() + data.columns[-2:].tolist()
data.drop(columns=ignore_cols, inplace=True)

In [34]:

data.columns

Out[34]:

Index(['热处理条件-热处理次数', '热处理条件-是否是中温停留', '第一次热处理-温度', '第一次热处理-升温速率',
       '第一次热处理-保留时间', '第二次热处理-温度', '第二次热处理-升温速率·', '第二次热处理-保留时间',
       '共碳化-是否是共碳化物质', '共碳化-共碳化物质/沥青', '模板剂-与沥青比例', '活化剂-是否KOH活化', '活化剂-比例',
       '混合方式-混合方式', '碳材料结构特征-比表面积', '碳材料结构特征-总孔体积', '碳材料结构特征-微孔体积',
       '碳材料结构特征-平均孔径', '共碳化-种类_2-甲基咪唑', '共碳化-种类_三聚氰胺', '共碳化-种类_尿素',
       '共碳化-种类_硫酸铵', '共碳化-种类_聚磷酸铵', '模板剂-模板剂制备方式_无', '模板剂-模板剂制备方式_溶液合成',
       '模板剂-模板剂制备方式_热分解', '模板剂-模板剂制备方式_直接购买', '模板剂-模板剂制备方式_自己合成',
       '模板剂-种类_Al2O3', '模板剂-种类_TiO2', '模板剂-种类_α-Fe2O3', '模板剂-种类_γ-Fe2O3',
       '模板剂-种类_二氧化硅', '模板剂-种类_氢氧化镁', '模板剂-种类_氧化钙', '模板剂-种类_氧化锌', '模板剂-种类_氧化镁',
       '模板剂-种类_氯化钠', '模板剂-种类_氯化钾', '模板剂-种类_碱式碳酸镁', '模板剂-种类_碳酸钙', '模板剂-种类_纤维素'],
      dtype='object')

In [5]:

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for col in ['热处理条件-热处理次数', '热处理条件-是否是中温停留', '共碳化-是否是共碳化物质', '活化剂-是否KOH活化', '混合方式-混合方式']:
    encoded_labels = encoder.fit_transform(data[col])
    data[col] = encoded_labels

In [6]:

object_cols = ['共碳化-种类', '模板剂-模板剂制备方式', '模板剂-种类']

In [7]:

data = pd.get_dummies(data, columns=object_cols)

In [8]:

out_cols = [x for x in data.columns if '碳材料结构特征' in x]
feature_cols = [x for x in data.columns if x not in out_cols]

In [9]:

train_data = data.reset_index(drop=True)

In [10]:

import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [11]:

from sklearn.model_selection import KFold, train_test_split
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [12]:

import numpy as np

In [31]:

params_xgb = {"objective": 'reg:squarederror',
             "subsample": 0.9,
             "max_depth": 15,
             "eta": 0.05,
             "gamma": 0,
             "lambda": 1,
             "alpha": 0,
             "colsample_bytree": 0.8,}
num_boost_round = 1000

In [14]:

import matplotlib.pyplot as plt

In [15]:

plt.rcParams["font.sans-serif"] = ["SimHei"]  # 设置字体
plt.rcParams["axes.unicode_minus"] = False  # 正常显示负号

In [32]:

eva_total = list()
index_list = list()
eva_cols = ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']
for col in out_cols:
    eva_list = list()
    train_data = train_data[~train_data[col].isna()].reset_index(drop=True)
    cur_test = list()
    cur_real = list()
    for (train_index, test_index) in kf.split(train_data):
        train = train_data.loc[train_index]
        valid = train_data.loc[test_index]
        X_train, Y_train = train[feature_cols], np.log1p(train[col])
        X_valid, Y_valid = valid[feature_cols], np.log1p(valid[col])
        dtrain = xgb.DMatrix(X_train, Y_train)
        dvalid = xgb.DMatrix(X_valid, Y_valid)
        watchlist = [(dvalid, 'eval')]
        gb_model = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist,
                        early_stopping_rounds=50, verbose_eval=False)
        y_pred = np.expm1(gb_model.predict(xgb.DMatrix(X_valid)))
        y_true = np.expm1(Y_valid.values)
        MSE = mean_squared_error(y_true, y_pred)
        RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
        MAE = mean_absolute_error(y_true, y_pred)
        MAPE = mean_absolute_percentage_error(y_true, y_pred)
        R_2 = r2_score(y_true, y_pred)
        cur_test.extend(y_pred[:7])
        cur_real.extend(y_true[:7])
        # print('MSE:', round(MSE, 4), end=', ')
        # print('RMSE:', round(RMSE, 4), end=', ')
        # print('MAE:', round(MAE, 4), end=', ')
        # print('MAPE:', round(MAPE*100, 2), '%', end=', ')
        # print('R_2:', round(R_2, 4))  #R方为负就说明拟合效果比平均值差
        eva_list.append([MSE, RMSE, MAE, MAPE, R_2])
    plt.figure(figsize=(12, 8))
    plt.plot(range(len(cur_test)), cur_real, 'o-', label='real')
    plt.plot(range(len(cur_test)), cur_test, '*-', label='pred')
    plt.legend(loc='best')
    plt.title(f'{col}')
    plt.show()
    eva_total.append(np.mean(eva_list, axis=0))
    index_list.append(f"{col}")

No description has been provided for this image

In [33]:

pd.DataFrame.from_records(eva_total, index=index_list, columns=eva_cols)

Out[33]:

	MSE	RMSE	MAE	MAPE	R2
碳材料结构特征-比表面积	303184.717382	544.657385	421.657610	0.847088	0.372696
碳材料结构特征-总孔体积	0.128721	0.354206	0.280979	0.900388	0.435381
碳材料结构特征-微孔体积	0.042627	0.205114	0.160595	3.522258	0.497659
碳材料结构特征-平均孔径	0.675843	0.763879	0.509585	0.172561	0.307433

In [ ]:

669 KiB Raw Blame History Unescape Escape

669 KiB

Raw Blame History