import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
import numpy as np
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel, WhiteKernel, DotProduct
from sklearn.feature_selection import SelectKBest, f_regression
import matplotlib.pyplot as plt
from joblib import dump, load
import os

# 设置随机种子以确保结果可复现
random_seed = 42
np.random.seed(random_seed)

def evaluate_model_accuracy(predict, real):
    """Compute standard regression error metrics.

    Args:
        predict: predicted values (any array-like).
        real: ground-truth values (any array-like, same length).

    Returns:
        dict with keys 'MAE', 'MSE', 'RMSE', 'MAPE' (in percent) and 'R_2'.

    Note:
        MAPE divides by ``real`` element-wise, so zero targets yield
        inf/NaN values — assumes a strictly non-zero target here.
    """
    y_pred = np.array(predict)
    y_true = np.array(real)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100,
        'R_2': r2_score(y_true, y_pred),
    }


def get_data(path, test_size=0.05, random_state=42):
    """Load a CSV dataset and split it into train / hold-out DataFrames.

    Args:
        path: path of the CSV file to read.
        test_size: fraction of rows sampled into the hold-out set
            (default 0.05, the previously hard-coded value).
        random_state: seed for the row sampling so the split is reproducible
            (default 42, matching the module-wide ``random_seed``).

    Returns:
        (train_df, test_df): two DataFrames with disjoint indices; ``test_df``
        is the sampled hold-out set and ``train_df`` is everything else.
    """
    data = pd.read_csv(path)
    # Sample the hold-out rows first, then keep the complement for training.
    test_df = data.sample(frac=test_size, random_state=random_state)
    train_df = data.drop(test_df.index)
    return train_df, test_df


def get_train_data(data, name):
    """Select the fixed feature columns plus the target *name* from *data*.

    Args:
        data: DataFrame containing at least the feature columns and the target.
        name: name of the target column (e.g. 'Tar').

    Returns:
        (train_x, train_y): features DataFrame (15 columns) and target Series.
    """
    # Fixed feature set used across this script (fuel analysis values and
    # pyrolysis condition columns — assumed; confirm against the dataset).
    feature_columns = ['A', 'V', 'FC', 'C', 'H', 'N', 'S', 'O', 'H/C', 'O/C', 'N/C', 'Rt', 'Hr', 'dp', 'T']
    columns = feature_columns + [name]
    # NOTE: the original debug print(columns) was removed; it polluted stdout.
    subset = data[columns]
    train_x = subset.drop(name, axis=1)
    train_y = subset[name]
    return train_x, train_y


def get_valid_data(data, name):
    """Select the fixed feature columns plus the target *name* from *data*.

    Mirrors get_train_data() and uses the same feature list so train and
    validation matrices always have identical columns.

    Args:
        data: DataFrame containing at least the feature columns and the target.
        name: name of the target column (e.g. 'Tar').

    Returns:
        (valid_x, valid_y): features DataFrame (15 columns) and target Series.
    """
    feature_columns = ['A', 'V', 'FC', 'C', 'H', 'N', 'S', 'O', 'H/C', 'O/C', 'N/C', 'Rt', 'Hr', 'dp', 'T']
    columns = feature_columns + [name]
    subset = data[columns]
    valid_x = subset.drop(name, axis=1)
    valid_y = subset[name]
    return valid_x, valid_y


# NOTE: a byte-for-byte duplicate of evaluate_model_accuracy() was defined
# here. It simply re-bound the same name with identical code and added
# nothing, so the duplicate has been removed; the definition earlier in this
# file is the one used by the code below.


def draw_picture(y_test, y_pred, model_acc, save_path, title):
    """Scatter-plot predictions vs. actual values, annotate with metrics, save a PNG.

    Args:
        y_test: actual target values.
        y_pred: predicted target values.
        model_acc: metrics dict as returned by evaluate_model_accuracy
            (keys 'MAE', 'MSE', 'RMSE', 'MAPE', 'R_2').
        save_path: directory to write the image into (created if missing).
        title: plot title; also used as the image file name (``<title>.png``).
    """
    plt.scatter(y_test, y_pred, c='blue', marker='o', label='Predicted vs Actual')
    # Reference line y = x: perfect predictions fall exactly on it.
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', lw=2, label='y = x')
    plt.xlabel('Actual values', fontsize=16)
    plt.ylabel('Predicted values', fontsize=16)
    plt.title(title)
    plt.legend(loc='best')
    metrics_text = (f"MSE: {round(model_acc['MSE'], 2)}\n"
                    f"RMSE: {round(model_acc['RMSE'], 2)}\n"
                    f"MAE: {round(model_acc['MAE'], 2)}\n"
                    f"MAPE: {round(model_acc['MAPE'], 2)}%\n"
                    f"R_square: {round(model_acc['R_2'], 2)}")
    # (BUGFIX: a duplicated assignment that stored this same text as an
    # attribute on the plt module — ``plt.metrics_text = ...`` — was removed;
    # it had no effect on the figure.)
    plt.text(0.75, 0.25, metrics_text, transform=plt.gca().transAxes, fontsize=10, verticalalignment='top')

    # Create the output directory if needed (exist_ok avoids a race between
    # the exists() check and makedirs()).
    os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, title + '.png')
    plt.savefig(path)
    print(f"图形已保存到: {path}")
    plt.show()


# --- Load the dataset and carve out a small hold-out split ---
path = "D:\\project\\ai_station\\meirejie\\data\\tar_data.csv"
train_data, valid_data = get_data(path)

# 'Tar' is the regression target; the remaining fixed columns are features.
train_x, train_y = get_train_data(train_data, 'Tar')
valid_x, valid_y = get_valid_data(valid_data, 'Tar')

print(valid_x.shape)


"""
|
||
线性回归
|
||
"""
|
||
# # 假设训练时的 Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()),
|
||
# ('regressor', LinearRegression())
|
||
# ])
|
||
# pipeline.fit(train_x, train_y)
|
||
|
||
# # 保存模型
|
||
# dump(pipeline, './model/tar_LinearRegression.joblib')
|
||
|
||
# # 预测
|
||
# pred_y = pipeline.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','LinearRegression')
|
||
# loaded_pipeline = load('./model/tar_LinearRegression.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
|
||
"""
|
||
岭回归
|
||
"""
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 数据标准化
|
||
# ('ridge', Ridge()) # 岭回归模型
|
||
# ])
|
||
|
||
# # 设置超参数网格
|
||
# param_grid = {
|
||
# 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100], # 正则化强度
|
||
# 'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sag'], # 求解器
|
||
# 'ridge__max_iter': [100, 500, 1000], # 最大迭代次数
|
||
# 'ridge__tol': [1e-4, 1e-3] # 收敛阈值
|
||
# }
|
||
# # 设置K折交叉验证
|
||
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 创建GridSearchCV对象
|
||
# grid_search = GridSearchCV(
|
||
# estimator=pipeline,
|
||
# param_grid=param_grid,
|
||
# cv=kfold,
|
||
# scoring='neg_mean_squared_error', # 使用负均方误差作为评分
|
||
# n_jobs=-1, # 使用所有可用的CPU核心
|
||
# verbose=1 # 显示详细过程
|
||
# )
|
||
|
||
# # 训练模型(自动进行超参数优化和交叉验证)
|
||
# print("开始训练和超参数优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_Ridge.joblib')
|
||
|
||
# # 预测
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','Ridge')
|
||
# loaded_pipeline = load('./model/tar_Ridge.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
|
||
"""
|
||
高斯回归
|
||
"""
|
||
|
||
# #定义更复杂的核函数组合
|
||
# base_kernels = [
|
||
# # RBF核 + 噪声
|
||
# ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(1e-5, (1e-8, 1e-1)),
|
||
# # Matern核(ν=1.5,适用于中等平滑数据)
|
||
# ConstantKernel(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=1.5) + WhiteKernel(1e-5, (1e-8, 1e-1)),
|
||
# # 组合核:RBF + 线性核
|
||
# ConstantKernel(1.0, (1e-3, 1e3)) * (RBF(1.0, (1e-2, 1e2)) + DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-2, 1e2))) + WhiteKernel(1e-5, (1e-8, 1e-1))
|
||
# ]
|
||
# # 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 可替换为MinMaxScaler()
|
||
# ('gpr', GaussianProcessRegressor(n_restarts_optimizer=20))
|
||
# ])
|
||
|
||
# # 定义超参数网格(更广的范围)
|
||
# param_grid = {
|
||
# 'gpr__kernel': base_kernels,
|
||
# 'gpr__alpha': [1e-6, 1e-5, 1e-4, 1e-3],
|
||
# 'gpr__normalize_y': [True, False]
|
||
# }
|
||
|
||
# # 设置K折交叉验证和网格搜索
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
# grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='neg_mean_squared_error',
|
||
# n_jobs=-1, verbose=2)
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索...")
|
||
# grid_search.fit(train_x, train_y)
|
||
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_GaussianProcessRegressor.joblib')
|
||
|
||
# # 预测 - 煤沥青
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','GaussianProcessRegressor')
|
||
# loaded_pipeline = load('./model/tar_GaussianProcessRegressor.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
|
||
"""
|
||
Lasso 回归
|
||
|
||
"""
|
||
# # 创建Pipeline(标准化 + Lasso)
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 必须标准化,因为Lasso对尺度敏感
|
||
# ('lasso', Lasso(max_iter=10000)) # 增加迭代次数确保收敛
|
||
# ])
|
||
|
||
# # 4. 定义超参数网格
|
||
# param_grid = {
|
||
# 'lasso__alpha': np.logspace(-6, 3, 50), # 正则化系数范围:0.0001到100
|
||
# 'lasso__selection': ['cyclic', 'random'] # 优化算法选择
|
||
# }
|
||
|
||
# # 5. 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 6. 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 7. 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_Lasso.joblib')
|
||
|
||
# # 预测
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','Lasso')
|
||
# loaded_pipeline = load('./model/tar_Lasso.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
|
||
"""
|
||
ElasticNet
|
||
"""
|
||
# 创建Pipeline(标准化 + Lasso)
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 必须标准化,因为Lasso对尺度敏感
|
||
# ('model', ElasticNet(max_iter=10000)) # 增加迭代次数确保收敛
|
||
# ])
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'model__alpha': np.logspace(-4, 2, 50),
|
||
# 'model__l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95] # 控制L1/L2混合比例
|
||
# }
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_ElasticNet.joblib')
|
||
|
||
# # 预测
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','ElasticNet')
|
||
# loaded_pipeline = load('./model/tar_ElasticNet.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""
|
||
K近邻回归
|
||
"""
|
||
# # 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # KNN对特征尺度敏感,必须标准化
|
||
# ('knn', KNeighborsRegressor())
|
||
# ])
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'knn__n_neighbors': np.arange(1, 4), # 最近邻的数量
|
||
# #'knn__weights': ['uniform', 'distance'], # 权重函数
|
||
# #'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], # 计算算法
|
||
# # 'knn__leaf_size': np.arange(1, 50), # 叶子节点大小
|
||
# # 'knn__p':[1,2],
|
||
# #'knn__metric': ['euclidean', 'manhattan'] # 距离度量
|
||
# }
|
||
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_KNeighborsRegressor.joblib')
|
||
|
||
# # 预测 - 煤沥青
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','KNeighborsRegressor')
|
||
# loaded_pipeline = load('./model/tar_KNeighborsRegressor.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""
|
||
SVR
|
||
"""
|
||
# # 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 优先尝试StandardScaler
|
||
# ('svr', SVR(max_iter=10000)) # 增加迭代次数确保收敛
|
||
# ])
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], # 核函数类型
|
||
# #'svr__C': np.linspace(0.8,1.2,50), # np.linspace(0,0.2,50)
|
||
# # 'svr__gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 10)), # 核系数
|
||
# #'svr__epsilon': np.linspace(0.1,0.2,50) # 控制对噪声的容忍度
|
||
# }
|
||
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_SVR.joblib')
|
||
|
||
# # 预测 - 煤沥青
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','SVR')
|
||
# loaded_pipeline = load('./model/tar_SVR.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""
|
||
决策树
|
||
"""
|
||
# # 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 虽然决策树不需要,但保留以便比较
|
||
# ('dtr', DecisionTreeRegressor(random_state=42))
|
||
# ])
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'dtr__criterion': ['squared_error', 'friedman_mse'], # 分裂标准
|
||
# 'dtr__max_depth': np.arange(1,30) ,# [None, 3, 5, 7, 10, 15, 20], # 树的最大深度
|
||
# 'dtr__min_samples_split': np.arange(1,10), # 分裂所需最小样本数
|
||
# 'dtr__min_samples_leaf': np.arange(1,50), # 叶节点最小样本数
|
||
# # 'dtr__max_features': ['auto', 'sqrt', 'log2', None] # 考虑的特征数量
|
||
# }
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_DTR.joblib')
|
||
|
||
# # 预测
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','DTR')
|
||
# loaded_pipeline = load('./model/tar_DTR.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""
|
||
随机森林
|
||
"""
|
||
# 创建Pipeline
|
||
pipeline = Pipeline([
|
||
('scaler', StandardScaler()), # 可选,随机森林对尺度不敏感
|
||
('rfr', RandomForestRegressor(random_state=42, n_jobs=-1))
|
||
])
|
||
|
||
# 定义超参数网格
|
||
param_grid = {
|
||
'rfr__n_estimators': np.arange(1, 50), # 树的数量
|
||
'rfr__max_depth': np.arange(1, 50), # 树的最大深度
|
||
# 'rfr__min_samples_split': [2, 5, 10], # 分裂所需最小样本数
|
||
# 'rfr__min_samples_leaf': [1, 2, 4], # 叶节点最小样本数
|
||
# 'rfr__max_features': ['auto', 'sqrt', 'log2', 0.5, 0.8], # 考虑的特征比例
|
||
# 'rfr__bootstrap': [True, False] # 是否使用bootstrap采样
|
||
}
|
||
|
||
# 设置5折交叉验证
|
||
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# 网格搜索(优化超参数)
|
||
grid_search = GridSearchCV(
|
||
pipeline,
|
||
param_grid,
|
||
cv=kf,
|
||
scoring='neg_mean_squared_error', # 最小化MSE
|
||
n_jobs=-1, # 使用所有CPU核心
|
||
verbose=2 # 打印进度
|
||
)
|
||
|
||
# 训练模型
|
||
print("开始网格搜索优化...")
|
||
grid_search.fit(train_x, train_y)
|
||
print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# 使用最佳模型进行预测
|
||
best_model = grid_search.best_estimator_
|
||
|
||
# 保存模型
|
||
dump(best_model, './model/tar_RFR.joblib')
|
||
|
||
# 预测
|
||
pred_y = best_model.predict(valid_x)
|
||
acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
print(acc)
|
||
draw_picture(valid_y,pred_y,acc,'./pic/tar','RFR')
|
||
loaded_pipeline = load('./model/tar_RFR.joblib')
|
||
pred_y = loaded_pipeline.predict(valid_x)
|
||
print(pred_y)
|
||
|
||
|
||
|
||
"""
|
||
ADBT
|
||
"""
|
||
# # 创建Pipeline
|
||
base_estimator = DecisionTreeRegressor(max_depth=3)
|
||
pipeline = Pipeline([
|
||
('scaler', StandardScaler()),
|
||
('adb', AdaBoostRegressor(
|
||
estimator=base_estimator, # 正确传递基础估计器
|
||
random_state=42
|
||
))
|
||
])
|
||
|
||
|
||
# 定义超参数网格
|
||
param_grid = {
|
||
'adb__n_estimators': [50, 100, 200],
|
||
'adb__learning_rate': [0.01, 0.1, 0.5, 1.0],
|
||
'adb__loss': ['linear', 'square', 'exponential'],
|
||
# 通过estimator参数传递决策树深度
|
||
'adb__estimator': [DecisionTreeRegressor(max_depth=d) for d in [1, 2, 3, 4]]
|
||
}
|
||
|
||
# 设置5折交叉验证
|
||
kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# 网格搜索(优化超参数)
|
||
grid_search = GridSearchCV(
|
||
pipeline,
|
||
param_grid,
|
||
cv=kf,
|
||
scoring='neg_mean_squared_error', # 最小化MSE
|
||
n_jobs=-1, # 使用所有CPU核心
|
||
verbose=2 # 打印进度
|
||
)
|
||
|
||
# 训练模型
|
||
print("开始网格搜索优化...")
|
||
grid_search.fit(train_x, train_y)
|
||
print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# 使用最佳模型进行预测
|
||
best_model = grid_search.best_estimator_
|
||
|
||
# 保存模型
|
||
dump(best_model, './model/tar_ADB.joblib')
|
||
|
||
# 预测 - 煤沥青
|
||
pred_y = best_model.predict(valid_x)
|
||
acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
print(acc)
|
||
draw_picture(valid_y,pred_y,acc,'./pic/tar','ADB')
|
||
loaded_pipeline = load('./model/tar_ADB.joblib')
|
||
pred_y = loaded_pipeline.predict(valid_x)
|
||
print(pred_y)
|
||
|
||
|
||
|
||
"""
|
||
XGB
|
||
"""
|
||
|
||
# 创建Pipeline
|
||
# base_estimator = DecisionTreeRegressor(max_depth=3)
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 可选
|
||
# ('xgb', xgb.XGBRegressor(objective='reg:squarederror',
|
||
# random_state=42,
|
||
# n_jobs=-1))
|
||
# ])
|
||
|
||
|
||
# # 定义超参数网格
|
||
|
||
# param_grid = {
|
||
# # 'xgb__n_estimators': np.arange(50,150), # 树的数量
|
||
# 'xgb__min_child_weight':[9],
|
||
# 'xgb__max_depth': [8], # 树的最大深度
|
||
# #'xgb__learning_rate': [0.01, 0.05, 0.1], # 学习率
|
||
# 'xgb__subsample': [0.9], # 样本采样比例
|
||
# 'xgb__eta':[0.33],
|
||
# # 'xgb__colsample_bytree': [0.6, 0.8, 1.0], # 特征采样比例
|
||
# # 'xgb__gamma': [0, 0.1, 0.2], # 最小分裂损失
|
||
# # 'xgb__reg_alpha': [0, 0.1, 1], # L1正则化
|
||
# # 'xgb__reg_lambda': [0.1, 1, 10] # L2正则化
|
||
# }
|
||
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/tar_XGB.joblib')
|
||
|
||
# # 预测 - 煤沥青
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/tar','XGB')
|
||
# loaded_pipeline = load('./model/tar_XGB.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y) |