coal_materials/20231227.ipynb at master

666 KiB

Raw Permalink Blame History

In [1]:

import pandas as pd

In [2]:

data = pd.read_excel('./data/20240102/20231227.xlsx', header=[0, 1])
data.head()

Out[2]:

	Unnamed: 0_level_0	原料特征		热处理条件		第一次热处理			第二次热处理		...	模板剂	活化剂		混合方式	碳材料结构特征					参考文献
	原料	是否预处理	软化点	热处理次数	是否是中温停留	温度	升温速率	保留时间	温度	升温速率·	...	与沥青比例	是否KOH活化	比例	混合方式	比表面积	总孔体积	微孔体积	平均孔径	平均孔径.1	Unnamed: 25_level_1
0	煤沥青	否	NaN	1	否	500	5	2	NaN	NaN	...	1.0	是	1.0	溶剂	908.07	0.40	0.34	1.75	NaN	姜宇晨硬模板法制备煤焦油沥青基多孔碳及其超电性能研究
1	煤沥青	否	NaN	1	否	600	5	2	NaN	NaN	...	1.0	是	0.5	溶剂	953.95	0.66	0.35	2.76	NaN	NaN
2	煤沥青	否	NaN	1	否	600	5	2	NaN	NaN	...	1.0	是	1.0	溶剂	1388.62	0.61	0.53	1.77	NaN	NaN
3	煤沥青	否	NaN	1	否	600	5	2	NaN	NaN	...	1.0	是	2.0	溶剂	1444.63	0.59	0.55	1.62	NaN	NaN
4	煤沥青	否	NaN	2	否	500	5	2	600.0	5.0	...	1.0	是	1.0	溶剂	1020.99	0.45	0.35	1.77	NaN	NaN

5 rows × 26 columns

In [3]:

data.shape

Out[3]:

(149, 26)

In [4]:

data.columns = ['-'.join([y for y in x if 'Unnamed' not in y]) for x in data.columns]

In [5]:

ignore_cols = data.columns[:3].tolist() + data.columns[-2:].tolist()
data.drop(columns=ignore_cols, inplace=True)

In [6]:

data.columns

Out[6]:

Index(['热处理条件-热处理次数', '热处理条件-是否是中温停留', '第一次热处理-温度', '第一次热处理-升温速率',
       '第一次热处理-保留时间', '第二次热处理-温度', '第二次热处理-升温速率·', '第二次热处理-保留时间',
       '共碳化-是否是共碳化物质', '共碳化-种类', '共碳化-共碳化物质/沥青', '模板剂-模板剂制备方式', '模板剂-种类',
       '模板剂-与沥青比例', '活化剂-是否KOH活化', '活化剂-比例', '混合方式-混合方式', '碳材料结构特征-比表面积',
       '碳材料结构特征-总孔体积', '碳材料结构特征-微孔体积', '碳材料结构特征-平均孔径'],
      dtype='object')

In [7]:

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for col in ['热处理条件-热处理次数', '热处理条件-是否是中温停留', '共碳化-是否是共碳化物质', '活化剂-是否KOH活化', '混合方式-混合方式']:
    encoded_labels = encoder.fit_transform(data[col])
    data[col] = encoded_labels

In [8]:

object_cols = ['共碳化-种类', '模板剂-模板剂制备方式', '模板剂-种类']

In [9]:

data = pd.get_dummies(data, columns=object_cols)

In [10]:

out_cols = [x for x in data.columns if '碳材料结构特征' in x]
feature_cols = [x for x in data.columns if x not in out_cols]

In [11]:

train_data = data.reset_index(drop=True)

In [12]:

train_data.shape

Out[12]:

(149, 42)

In [14]:

train_data.to_csv('./train_data.csv', index=False)

In [11]:

import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [12]:

from sklearn.model_selection import KFold, train_test_split
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [13]:

import numpy as np

In [14]:

params_xgb = {"objective": 'reg:squarederror',
             "subsample": 0.9,
             "max_depth": 15,
             "eta": 0.05,
             "gamma": 0,
             "lambda": 1,
             "alpha": 0,
             "colsample_bytree": 0.8,}
num_boost_round = 1000

In [15]:

import matplotlib.pyplot as plt

In [16]:

plt.rcParams["font.sans-serif"] = ["SimHei"]  # 设置字体
plt.rcParams["axes.unicode_minus"] = False  # 正常显示负号

In [17]:

eva_total = list()
index_list = list()
eva_cols = ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']
for col in out_cols:
    eva_list = list()
    train_data = train_data[~train_data[col].isna()].reset_index(drop=True)
    cur_test = list()
    cur_real = list()
    for (train_index, test_index) in kf.split(train_data):
        train = train_data.loc[train_index]
        valid = train_data.loc[test_index]
        X_train, Y_train = train[feature_cols], np.log1p(train[col])
        X_valid, Y_valid = valid[feature_cols], np.log1p(valid[col])
        dtrain = xgb.DMatrix(X_train, Y_train)
        dvalid = xgb.DMatrix(X_valid, Y_valid)
        watchlist = [(dvalid, 'eval')]
        gb_model = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist,
                        early_stopping_rounds=50, verbose_eval=False)
        y_pred = np.expm1(gb_model.predict(xgb.DMatrix(X_valid)))
        y_true = np.expm1(Y_valid.values)
        MSE = mean_squared_error(y_true, y_pred)
        RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
        MAE = mean_absolute_error(y_true, y_pred)
        MAPE = mean_absolute_percentage_error(y_true, y_pred)
        R_2 = r2_score(y_true, y_pred)
        cur_test.extend(y_pred[:7])
        cur_real.extend(y_true[:7])
        # print('MSE:', round(MSE, 4), end=', ')
        # print('RMSE:', round(RMSE, 4), end=', ')
        # print('MAE:', round(MAE, 4), end=', ')
        # print('MAPE:', round(MAPE*100, 2), '%', end=', ')
        # print('R_2:', round(R_2, 4))  #R方为负就说明拟合效果比平均值差
        eva_list.append([MSE, RMSE, MAE, MAPE, R_2])
    plt.figure(figsize=(12, 8))
    plt.plot(range(len(cur_test)), cur_real, 'o-', label='real')
    plt.plot(range(len(cur_test)), cur_test, '*-', label='pred')
    plt.legend(loc='best')
    plt.title(f'{col}')
    plt.show()
    eva_total.append(np.mean(eva_list, axis=0))
    index_list.append(f"{col}")

No description has been provided for this image

In [18]:

pd.DataFrame.from_records(eva_total, index=index_list, columns=eva_cols)

Out[18]:

	MSE	RMSE	MAE	MAPE	R2
碳材料结构特征-比表面积	333217.197067	545.874683	419.177758	0.602743	0.369714
碳材料结构特征-总孔体积	0.115585	0.326015	0.260617	0.855863	0.426921
碳材料结构特征-微孔体积	0.047409	0.216009	0.172998	2.598594	0.487892
碳材料结构特征-平均孔径	0.675843	0.763879	0.509585	0.172561	0.307433

In [ ]:

666 KiB Raw Permalink Blame History Unescape Escape

666 KiB

Raw Permalink Blame History