# ai-station-code/jiawanyuce/model_train.py
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import os
import chardet
import joblib
from sklearn.metrics import (
mean_squared_error,
mean_absolute_error,
median_absolute_error,
mean_absolute_percentage_error,
r2_score
)
# Read input files relative to the process working directory: the script is
# expected to be launched from the repository root (paths below prepend
# 'jiawanyuce/...'), so the cwd is printed for a quick sanity check.
current_directory = os.getcwd()
print(current_directory)
def get_orign_data():
    """Load and merge the inflow, outflow and CH4-gas CSVs into one frame.

    Returns a DataFrame with a 1-based ``index`` join key plus the inflow
    composition/flow columns (X_ch, X_pr, X_li, X_I, Q), the outflow pH,
    and the CH4 gas measurements (Nm3d-1-ch4, S_gas_ch4).
    """
    # Join path components individually so this works on every OS
    # (the original 'jiawanyuce\data' literal only worked on Windows and
    # relied on the invalid '\d' escape sequence).
    save_directory = os.path.join(current_directory, 'jiawanyuce', 'data')

    # Inflow file: the real header sits in data row 1; data starts at row 2.
    inflow = pd.read_csv(os.path.join(save_directory, 'inflowAll.csv'), encoding='utf-8')
    inflow.columns = inflow.iloc[1, :]
    # Columns: carbohydrate / protein / lipid concentration, particulate inerts, input flow
    inflow = inflow.loc[2:, ['X_ch', 'X_pr', 'X_li', 'X_I', 'Q']]
    inflow[['X_ch', 'X_pr', 'X_li', 'X_I', 'Q']] = inflow[['X_ch', 'X_pr', 'X_li', 'X_I', 'Q']].astype('float32')
    inflow = inflow.reset_index(drop=True)
    inflow = inflow.reset_index()
    inflow['index'] = inflow.index.map(lambda x: x + 1)  # 1-based join key

    # Outflow file: same layout; drop columns duplicated in the other files.
    outflow = pd.read_csv(os.path.join(save_directory, 'outFlow.csv'), encoding='utf-8')
    outflow.columns = outflow.iloc[1, :]
    outflow = outflow.loc[2:, :]
    outflow = outflow.apply(pd.to_numeric, errors='raise')
    outflow.drop(['X_ch', 'X_pr', 'X_li', 'X_I', 'S_gas_ch4'], axis=1, inplace=True)
    outflow = outflow.reset_index(drop=True)
    outflow = outflow.reset_index()
    outflow['index'] = outflow.index.map(lambda x: x + 1)

    # CH4 gas file: header sits in data row 0; rename headers to short codes.
    ch4gasall = pd.read_csv(os.path.join(save_directory, 'ch4GasAll.csv'), encoding='utf-8')
    ch4gasall.columns = ch4gasall.iloc[0, :]
    ch4gasall = ch4gasall.loc[1:, :]
    ch4gasall = ch4gasall.apply(pd.to_numeric, errors='raise')
    ch4gasall = ch4gasall[['甲烷流量Nm3d-1', '气体总流量Nm3d-1', 'S_gas_ch4kg COD m-3']]
    ch4gasall.columns = ['Nm3d-1-ch4', 'Nm3d-1-all', 'S_gas_ch4']
    ch4gasall = ch4gasall.reset_index(drop=True)
    ch4gasall = ch4gasall.reset_index()
    ch4gasall['index'] = ch4gasall.index.map(lambda x: x + 1)

    # Left-join the three sources on the shared 1-based index.
    data = inflow.merge(outflow, how='left', on=['index'])
    data = data.merge(ch4gasall, how='left', on=['index'])
    data = data[['index', 'X_ch', 'X_pr', 'X_li', 'X_I', 'Q', 'pH', 'Nm3d-1-ch4', 'S_gas_ch4']]
    return data
def data_feature(data, x_cols):
    """Add lag, difference and rolling-window features for each column in *x_cols*.

    The data is sampled every 15 minutes, so a window of 4 rows = 1 hour,
    8 rows = 2 hours, and 4*24 rows = 1 day.  For every column this adds:
    four 15-minute lags, 1h/2h rolling stats, 1- and 2-step diffs, and
    daily rolling stats.  Mutates *data* in place and returns it.
    """
    for name in x_cols:
        # Lags: value 15/30/45/60 minutes earlier.
        data[name + "_15_first"] = data[name].shift(1)
        data[name + "_30_first"] = data[name].shift(2)
        data[name + "_45_first"] = data[name].shift(3)
        data[name + "_60_first"] = data[name].shift(4)
        # 1-hour (4-sample) rolling statistics.
        data[name + "_1h_mean"] = data[name].rolling(4).mean()
        data[name + "_1h_max"] = data[name].rolling(4).max()
        data[name + "_1h_min"] = data[name].rolling(4).min()
        data[name + "_1h_median"] = data[name].rolling(4).median()
        data[name + "_1h_std"] = data[name].rolling(4).std()
        data[name + "_1h_var"] = data[name].rolling(4).var()
        data[name + "_1h_skew"] = data[name].rolling(4).skew()
        data[name + "_1h_kurt"] = data[name].rolling(4).kurt()
        # Short-term differences.
        data[name + "_1_diff"] = data[name].diff(periods=1)
        data[name + "_2_diff"] = data[name].diff(periods=2)
        # 2-hour (8-sample) rolling statistics.
        data[name + "_2h_mean"] = data[name].rolling(8).mean()
        data[name + "_2h_max"] = data[name].rolling(8).max()
        data[name + "_2h_min"] = data[name].rolling(8).min()
        data[name + "_2h_median"] = data[name].rolling(8).median()
        data[name + "_2h_std"] = data[name].rolling(8).std()
        data[name + "_2h_var"] = data[name].rolling(8).var()
        data[name + "_2h_skew"] = data[name].rolling(8).skew()
        data[name + "_2h_kurt"] = data[name].rolling(8).kurt()
    # Daily (96-sample) rolling statistics.  (Original comment said daily
    # features were unwanted because they're "too long", yet they were
    # still generated — kept for compatibility with feature_list.)
    for name in x_cols:
        data[name + "_d_mean"] = data[name].rolling(4 * 24).mean()
        data[name + "_d_max"] = data[name].rolling(4 * 24).max()
        data[name + "_d_min"] = data[name].rolling(4 * 24).min()
        # BUGFIX: was rolling(4) — a 1-hour window — unlike every other
        # daily statistic; use the full 1-day window for consistency.
        data[name + "_d_median"] = data[name].rolling(4 * 24).median()
        data[name + "_d_std"] = data[name].rolling(4 * 24).std()
        data[name + "_d_var"] = data[name].rolling(4 * 24).var()
        data[name + "_d_skew"] = data[name].rolling(4 * 24).skew()
        data[name + "_d_kurt"] = data[name].rolling(4 * 24).kurt()
    return data
def data_result(data, target_1):
    """Append the next four future values of *target_1* as label columns.

    Adds ``<target_1>_k_after`` holding the value k steps ahead, for
    k = 1..4 (the last k rows of each column are NaN).  Mutates *data*
    in place and returns it.
    """
    for step in range(1, 5):
        data[f"{target_1}_{step}_after"] = data[target_1].shift(-step)
    return data
def regression_metrics(true, pred):
    """Print per-column regression metrics (MSE, RMSE, MAE, MedianAE, MAPE, R2).

    Accepts 1-D or 2-D array-likes of equal shape; 1-D inputs are treated
    as a single column (the original crashed on ``true.shape[1]`` for 1-D).
    *true* must be the ground truth and *pred* the predictions — MAPE and
    R2 are not symmetric in their arguments.  Raises ValueError on shape
    mismatch.  Returns None; output goes to stdout.
    """
    print('回归模型评估指标结果:')
    true = np.array(true)
    pred = np.array(pred)
    # Generalize: promote 1-D targets to a single column so the per-column
    # loop below works for both shapes.
    if true.ndim == 1:
        true = true.reshape(-1, 1)
    if pred.ndim == 1:
        pred = pred.reshape(-1, 1)
    if true.shape != pred.shape:
        raise ValueError("真实值和预测值的形状不一致: {} vs {}".format(true.shape, pred.shape))
    for i in range(true.shape[1]):
        print(f'对于第 {i+1} 列:')
        print('均方误差【MSE】:', mean_squared_error(true[:, i], pred[:, i]))
        print('均方根误差【RMSE】:', np.sqrt(mean_squared_error(true[:, i], pred[:, i])))
        print('平均绝对误差【MAE】:', mean_absolute_error(true[:, i], pred[:, i]))
        print('绝对误差中位数【MedianAE】:', median_absolute_error(true[:, i], pred[:, i]))
        print('平均绝对百分比误差【MAPE】:', mean_absolute_percentage_error(true[:, i], pred[:, i]))
        print("R2 【R2 score】:", r2_score(true[:, i], pred[:, i]))
        print('-----------------------')
# Base measurement columns, in the exact order data_feature processes them.
_BASE_COLS = ['X_ch', 'X_pr', 'X_li', 'X_I', 'Q', 'pH', 'Nm3d-1-ch4', 'S_gas_ch4']
# Rolling-statistic suffixes, in the order data_feature generates them.
_ROLL_STATS = ['mean', 'max', 'min', 'median', 'std', 'var', 'skew', 'kurt']

# Model input columns: the 8 raw columns, then per-column lag / hourly
# rolling / diff features, then per-column daily rolling features — built
# programmatically so it cannot drift out of sync with the naming scheme.
feature_list = list(_BASE_COLS)
for _col in _BASE_COLS:
    feature_list += [f'{_col}_{_m}_first' for _m in (15, 30, 45, 60)]
    feature_list += [f'{_col}_1h_{_s}' for _s in _ROLL_STATS]
    feature_list += [f'{_col}_1_diff', f'{_col}_2_diff']
    feature_list += [f'{_col}_2h_{_s}' for _s in _ROLL_STATS]
for _col in _BASE_COLS:
    feature_list += [f'{_col}_d_{_s}' for _s in _ROLL_STATS]
# Labels for model 1: CH4 volumetric flow 1–4 steps (15 min each) ahead.
target_1_list = [f'Nm3d-1-ch4_{k}_after' for k in range(1, 5)]
# Labels for model 2: CH4 gas-phase concentration 1–4 steps ahead.
target_2_list = [f'S_gas_ch4_{k}_after' for k in range(1, 5)]
if __name__ == "__main__":
    data = get_orign_data()
    x_cols = ['X_ch', 'X_pr', 'X_li', 'X_I', 'Q', 'pH', 'Nm3d-1-ch4', 'S_gas_ch4']
    target_1 = 'Nm3d-1-ch4'
    target_2 = 'S_gas_ch4'
    data = data_feature(data, x_cols)
    data = data_result(data, target_1)
    data = data_result(data, target_2)
    # Drop warm-up rows whose rolling-window features are NaN.
    # NOTE(review): the 95-row drop is applied twice (190 rows total), but
    # the daily window only needs 95 — confirm the second drop is intended.
    data = data.iloc[95:, :]
    data = data.iloc[95:, :]
    # Drop the tail rows whose 1..4-step-ahead labels are NaN.
    data = data.iloc[:-4, :]

    # Hold out the last 14 days (4 samples/hour) as sample/test data.
    test_data = data.iloc[-14 * 24 * 4:, :]
    train_data = data.iloc[:-14 * 24 * 4, :]
    # Persist only the raw columns of the hold-out set as example data.
    test_data = test_data[['index', 'X_ch', 'X_pr', 'X_li', 'X_I', 'Q', 'pH', 'Nm3d-1-ch4', 'S_gas_ch4']]
    test_data.to_csv(os.path.join(current_directory, 'jiawanyuce/data/jiawan_test.csv'), index=False)

    def _train_and_save(frame, target_cols, model_path):
        """Fit one XGBoost regressor mapping feature_list -> target_cols,
        print its evaluation metrics, and persist it to *model_path*."""
        X = np.array(frame[feature_list].values)
        y = np.array(frame[target_cols].values)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'eta': 0.1,
            'max_depth': 3,
        }
        model = xgb.train(params, dtrain, num_boost_round=100)
        predictions = model.predict(dtest)
        # BUGFIX: ground truth must be the first argument — MAPE and R2 are
        # asymmetric, and the original passed (predictions, y_test).
        regression_metrics(y_test, predictions)
        joblib.dump(model, model_path)

    # Train and save the CH4 flow model.
    _train_and_save(train_data, target_1_list,
                    os.path.join(current_directory, 'jiawanyuce/liuliang_model/xgb_model_liuliang.pkl'))
    # Train and save the CH4 gas concentration model
    # (the original section comment wrongly labeled this "flow").
    _train_and_save(train_data, target_2_list,
                    os.path.join(current_directory, 'jiawanyuce/qixiangnongdu_model/xgb_model_qixiangnongdu.pkl'))