"""Train two multi-output XGBoost models (methane flow rate and gas-phase
methane concentration) from 15-minute reactor simulation CSV exports.

Data layout assumption: one row every 15 minutes, so a window of 4 rows is
one hour and 4*24 = 96 rows is one day.
"""

import os

import chardet  # kept: used by the (currently disabled) encoding-detection path
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)
from sklearn.model_selection import train_test_split

# Base directory for all data / model paths.
current_directory = os.getcwd()
print(current_directory)


def get_orign_data():
    """Load and merge the three raw CSV exports into one aligned frame.

    Reads inflow, outflow and CH4-gas CSVs from <cwd>/jiawanyuce/data,
    aligns them on a 1-based row counter ('index') and returns the columns
    ['index','X_ch','X_pr','X_li','X_I','Q','pH','Nm3d-1-ch4','S_gas_ch4'].
    """
    # FIX: 'jiawanyuce\data' contained a backslash escape in the literal
    # (invalid-escape warning, broken on non-Windows); join components instead.
    save_directory = os.path.join(current_directory, 'jiawanyuce', 'data')

    # Inflow: the real header lives in the second file row; data starts at row 2.
    # Columns: carbohydrate / protein / lipid / particulate inerts / flow rate.
    inflow = pd.read_csv(os.path.join(save_directory, 'inflowAll.csv'), encoding='utf-8')
    inflow.columns = inflow.iloc[1, :]
    inflow = inflow.loc[2:, ['X_ch', 'X_pr', 'X_li', 'X_I', 'Q']]
    inflow[['X_ch', 'X_pr', 'X_li', 'X_I', 'Q']] = (
        inflow[['X_ch', 'X_pr', 'X_li', 'X_I', 'Q']].astype('float32'))
    inflow = inflow.reset_index(drop=True).reset_index()
    inflow['index'] = inflow.index.map(lambda x: x + 1)  # 1-based join key

    # Outflow: same two-row header layout; drop columns duplicated elsewhere.
    outflow = pd.read_csv(os.path.join(save_directory, 'outFlow.csv'), encoding='utf-8')
    outflow.columns = outflow.iloc[1, :]
    outflow = outflow.loc[2:, :]
    outflow = outflow.apply(pd.to_numeric, errors='raise')
    outflow.drop(['X_ch', 'X_pr', 'X_li', 'X_I', 'S_gas_ch4'], axis=1, inplace=True)
    outflow = outflow.reset_index(drop=True).reset_index()
    outflow['index'] = outflow.index.map(lambda x: x + 1)

    # CH4 gas: header in the first row; rename to ASCII-ish working names.
    ch4gasall = pd.read_csv(os.path.join(save_directory, 'ch4GasAll.csv'), encoding='utf-8')
    ch4gasall.columns = ch4gasall.iloc[0, :]
    ch4gasall = ch4gasall.loc[1:, :]
    ch4gasall = ch4gasall.apply(pd.to_numeric, errors='raise')
    ch4gasall = ch4gasall[['甲烷流量(Nm3d-1)', '气体总流量(Nm3d-1)', 'S_gas_ch4(kg COD m-3)']]
    ch4gasall.columns = ['Nm3d-1-ch4', 'Nm3d-1-all', 'S_gas_ch4']
    ch4gasall = ch4gasall.reset_index(drop=True).reset_index()
    ch4gasall['index'] = ch4gasall.index.map(lambda x: x + 1)

    data = inflow.merge(outflow, how='left', on=['index'])
    data = data.merge(ch4gasall, how='left', on=['index'])
    data = data[['index', 'X_ch', 'X_pr', 'X_li', 'X_I', 'Q', 'pH',
                 'Nm3d-1-ch4', 'S_gas_ch4']]
    return data


# Rolling statistics computed for every window; order matters because the
# generated feature_list below must match data_feature's column names.
_ROLL_STATS = ('mean', 'max', 'min', 'median', 'std', 'var', 'skew', 'kurt')


def data_feature(data, x_cols):
    """Add lag and rolling-window features (in place) for each column in x_cols.

    Per column: 15/30/45/60-minute lags, 1-hour (4-row) and 2-hour (8-row)
    rolling statistics, 1- and 2-step differences, and daily (96-row)
    rolling statistics.  Returns the mutated frame.
    """
    for name in x_cols:
        col = data[name]
        # 15/30/45/60-minute lags.
        for k in range(1, 5):
            data[f"{name}_{15 * k}_first"] = col.shift(k)
        # 1-hour and 2-hour rolling statistics.
        for label, window in (('1h', 4), ('2h', 8)):
            roll = col.rolling(window)
            for stat in _ROLL_STATS:
                data[f"{name}_{label}_{stat}"] = getattr(roll, stat)()
        # Short-horizon differences.
        data[f"{name}_1_diff"] = col.diff(periods=1)
        data[f"{name}_2_diff"] = col.diff(periods=2)
    for name in x_cols:
        # Daily statistics over a full day (4 samples/hour * 24 h = 96 rows).
        # FIX: the daily median originally used rolling(4) (a 1-hour window)
        # while every other _d_* statistic used rolling(96); now consistent.
        roll = data[name].rolling(4 * 24)
        for stat in _ROLL_STATS:
            data[f"{name}_d_{stat}"] = getattr(roll, stat)()
    return data


def data_result(data, target_1):
    """Add the next four 15-minute steps of target_1 as prediction targets.

    Creates '<target_1>_k_after' for k in 1..4 (negative shifts), leaving NaN
    in the last four rows; the caller trims those.  Returns the mutated frame.
    """
    for k in range(1, 5):
        data[f"{target_1}_{k}_after"] = data[target_1].shift(-k)
    return data


def regression_metrics(true, pred):
    """Print MSE / RMSE / MAE / MedianAE / MAPE / R2 for each output column.

    Parameters are array-likes of identical shape (n_samples, n_outputs).
    Raises ValueError when the shapes differ.
    """
    print('回归模型评估指标结果:')
    true = np.array(true)
    pred = np.array(pred)
    if true.shape != pred.shape:
        raise ValueError("真实值和预测值的形状不一致: {} vs {}".format(true.shape, pred.shape))
    for i in range(true.shape[1]):
        t, p = true[:, i], pred[:, i]
        mse = mean_squared_error(t, p)
        print(f'对于第 {i+1} 列:')
        print('均方误差【MSE】:', mse)
        print('均方根误差【RMSE】:', np.sqrt(mse))
        print('平均绝对误差【MAE】:', mean_absolute_error(t, p))
        print('绝对误差中位数【MedianAE】:', median_absolute_error(t, p))
        print('平均绝对百分比误差【MAPE】:', mean_absolute_percentage_error(t, p))
        print("R2 【R2 score】:", r2_score(t, p))
        print('-----------------------')


# Raw input columns, in the order used throughout this script.
_BASE_COLS = ['X_ch', 'X_pr', 'X_li', 'X_I', 'Q', 'pH', 'Nm3d-1-ch4', 'S_gas_ch4']


def _make_feature_list(cols):
    """Return model feature names in the exact order of the original
    hand-written list: base columns, then per-column lag/1h/diff/2h blocks,
    then per-column daily blocks (matches data_feature's output names)."""
    feats = list(cols)
    for c in cols:
        feats += [f"{c}_{m}_first" for m in (15, 30, 45, 60)]
        feats += [f"{c}_1h_{s}" for s in _ROLL_STATS]
        feats += [f"{c}_1_diff", f"{c}_2_diff"]
        feats += [f"{c}_2h_{s}" for s in _ROLL_STATS]
    for c in cols:
        feats += [f"{c}_d_{s}" for s in _ROLL_STATS]
    return feats


feature_list = _make_feature_list(_BASE_COLS)

# Prediction targets: next 1..4 steps of methane flow / gas concentration.
target_1_list = [f'Nm3d-1-ch4_{k}_after' for k in range(1, 5)]
target_2_list = [f'S_gas_ch4_{k}_after' for k in range(1, 5)]


def _train_and_save(train_data, target_list, model_path):
    """Fit a multi-output XGBoost regressor (feature_list -> target_list),
    print held-out metrics, and persist the booster to model_path."""
    X = np.array(train_data[feature_list].values)
    y = np.array(train_data[target_list].values)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.1,
        'max_depth': 3,
    }
    model = xgb.train(params, dtrain, num_boost_round=100)
    predictions = model.predict(dtest)
    # FIX: regression_metrics expects (true, pred); the original call passed
    # (predictions, y_test), corrupting the asymmetric metrics (MAPE, R2).
    regression_metrics(y_test, predictions)
    joblib.dump(model, model_path)


if __name__ == "__main__":
    data = get_orign_data()
    target_1 = 'Nm3d-1-ch4'
    target_2 = 'S_gas_ch4'
    data = data_feature(data, _BASE_COLS)
    data = data_result(data, target_1)
    data = data_result(data, target_2)
    # Drop warm-up rows lacking a full 96-step (1-day) window, and the tail
    # rows whose "_after" targets are NaN.
    # FIX: the 95-row head trim was accidentally applied twice, silently
    # discarding an extra 95 valid rows; it is applied once now.
    data = data.iloc[95:, :]
    data = data.iloc[:-4, :]

    # Hold out the last 14 days (4 samples/hour) as sample/demo data.
    holdout = 14 * 24 * 4
    test_data = data.iloc[-holdout:, :]
    train_data = data.iloc[:-holdout, :]
    test_data = test_data[['index', 'X_ch', 'X_pr', 'X_li', 'X_I', 'Q', 'pH',
                           'Nm3d-1-ch4', 'S_gas_ch4']]
    test_data.to_csv(
        os.path.join(current_directory, 'jiawanyuce', 'data', 'jiawan_test.csv'),
        index=False)

    # Methane flow-rate model.
    _train_and_save(
        train_data, target_1_list,
        os.path.join(current_directory, 'jiawanyuce', 'liuliang_model',
                     'xgb_model_liuliang.pkl'))
    # Gas-phase methane concentration model.
    # (The original comment here wrongly said "flow model" — copy-paste slip.)
    _train_and_save(
        train_data, target_2_list,
        os.path.join(current_directory, 'jiawanyuce', 'qixiangnongdu_model',
                     'xgb_model_qixiangnongdu.pkl'))