2025-05-06 11:18:48 +08:00
|
|
|
|
import pandas as pd
|
|
|
|
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
|
|
|
|
|
from sklearn.model_selection import train_test_split, cross_val_score, KFold,GridSearchCV
|
|
|
|
|
import numpy as np
|
|
|
|
|
import xgboost as xgb
|
|
|
|
|
from sklearn.svm import SVR
|
|
|
|
|
from sklearn.tree import DecisionTreeRegressor
|
2025-05-14 11:00:24 +08:00
|
|
|
|
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor
|
2025-05-06 11:18:48 +08:00
|
|
|
|
from sklearn.pipeline import Pipeline
|
|
|
|
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
|
|
|
|
|
from sklearn.linear_model import LinearRegression,Ridge,LogisticRegression, Lasso, ElasticNet
|
|
|
|
|
from sklearn.neighbors import KNeighborsRegressor
|
|
|
|
|
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
|
|
|
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel, WhiteKernel, DotProduct
|
|
|
|
|
from sklearn.feature_selection import SelectKBest, f_regression
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
from joblib import dump, load
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_ssa_data(path='./data/SSA.xlsx', target='SSA'):
    """Load the SSA training and validation sets from one Excel workbook.

    Sheet ``sheet1`` supplies the training rows and ``sheet2`` the
    validation rows. In each sheet the ``target`` column is split off as
    the label and every remaining column is kept as a feature.

    Args:
        path: Location of the workbook. Defaults to ``./data/SSA.xlsx``,
            the path this function always used.
        target: Name of the label column. Defaults to ``'SSA'``.

    Returns:
        A ``(train_x, train_y, valid_x, valid_y)`` tuple: feature
        DataFrames and label Series for the training and validation sheets.

    Raises:
        KeyError: If ``target`` is not a column of one of the sheets.
    """
    # Asking for both sheets in a single call makes pandas parse the
    # workbook once and hand back a dict keyed by sheet name.
    sheets = pd.read_excel(path, sheet_name=['sheet1', 'sheet2'])
    train_data = sheets['sheet1']
    valid_data = sheets['sheet2']

    train_x = train_data.drop(target, axis=1)
    train_y = train_data[target]
    valid_x = valid_data.drop(target, axis=1)
    valid_y = valid_data[target]
    return train_x, train_y, valid_x, valid_y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def evaluate_model_accuracy(predict, real):
    """Compute standard regression error metrics for a set of predictions.

    Args:
        predict: Predicted values; anything ``np.array`` can convert.
        real: Ground-truth values aligned element-wise with ``predict``.

    Returns:
        A dict with the keys ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'MAPE'``
        (expressed in percent) and ``'R_2'``.
    """
    predict = np.array(predict)
    real = np.array(real)

    # Mean absolute error
    mae = mean_absolute_error(real, predict)
    # Mean squared error and its root
    mse = mean_squared_error(real, predict)
    rmse = np.sqrt(mse)
    # Mean absolute percentage error, scaled to percent. The library
    # routine is used instead of a hand-written ``(real - predict) / real``
    # so that MAPE gets the same input validation as the other metrics and
    # a zero target no longer produces a division by zero.
    mape = mean_absolute_percentage_error(real, predict) * 100
    # Coefficient of determination
    r2 = r2_score(real, predict)

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'MAPE': mape,
        'R_2': r2
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def draw_picture(y_test, y_pred, model_acc, save_path, title):
    """Draw a predicted-vs-actual scatter plot, save it as a PNG and show it.

    The plot contains the scatter points, a red ``y = x`` reference line
    spanning the range of ``y_test``, and a text box listing the metrics
    rounded to two decimals.

    Args:
        y_test: Actual target values (x axis).
        y_pred: Predicted target values (y axis).
        model_acc: Mapping with the keys ``'MSE'``, ``'RMSE'``, ``'MAE'``,
            ``'MAPE'`` and ``'R_2'``.
        save_path: Directory that receives the image; created if missing.
        title: Plot title, also used as the PNG file name (``<title>.png``).
    """
    # Remember which figure is drawn on so it can be released at the end.
    fig = plt.gcf()

    plt.scatter(y_test, y_pred, c='blue', marker='o', label='Predicted vs Actual')
    lowest, highest = min(y_test), max(y_test)
    plt.plot([lowest, highest], [lowest, highest], color='red', lw=2, label='y = x')
    plt.xlabel('Actual values', fontsize=16)
    plt.ylabel('Predicted values', fontsize=16)
    plt.title(title)
    plt.legend(loc='best')

    metrics_text = (f"MSE: {round(model_acc['MSE'], 2)}\n"
                    f"RMSE: {round(model_acc['RMSE'], 2)}\n"
                    f"MAE: {round(model_acc['MAE'], 2)}\n"
                    f"MAPE: {round(model_acc['MAPE'], 2)}%\n"
                    f"R_square: {round(model_acc['R_2'], 2)}")
    # Coordinates are axes fractions (transAxes), so the box stays in the
    # lower-right area whatever the data range is.
    plt.text(0.75, 0.25, metrics_text, transform=plt.gca().transAxes, fontsize=10, verticalalignment='top')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, title + '.png')
    plt.savefig(path)
    print(f"图形已保存到: {path}")
    plt.show()
    # Close the figure so a later call starts from a clean canvas instead of
    # drawing on top of this plot when show() does not destroy the figure.
    plt.close(fig)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the train/validation split once; the model code further down reuses it.
train_x, train_y, valid_x, valid_y = get_ssa_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
线性回归
|
|
|
|
|
"""
|
|
|
|
|
# # 假设训练时的 Pipeline
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()),
|
|
|
|
|
# ('regressor', LinearRegression())
|
|
|
|
|
# ])
|
|
|
|
|
# pipeline.fit(train_x, train_y)
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(pipeline, './model/SSA_LinearRegression.joblib')
|
|
|
|
|
# # 预测
|
|
|
|
|
# pred_y = pipeline.predict(valid_x)
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','LinearRegression')
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# # loaded_pipeline = load('./model/SSA_LinearRegression.joblib')
|
|
|
|
|
# # # 使用模型预测
|
|
|
|
|
# # pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# # print(pred_y)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
岭回归
|
|
|
|
|
"""
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # 数据标准化
|
|
|
|
|
# ('ridge', Ridge()) # 岭回归模型
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
# # 设置超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100], # 正则化强度
|
|
|
|
|
# 'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sag'], # 求解器
|
|
|
|
|
# 'ridge__max_iter': [100, 500, 1000], # 最大迭代次数
|
|
|
|
|
# 'ridge__tol': [1e-4, 1e-3] # 收敛阈值
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 设置K折交叉验证
|
|
|
|
|
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 创建GridSearchCV对象
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# estimator=pipeline,
|
|
|
|
|
# param_grid=param_grid,
|
|
|
|
|
# cv=kfold,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 使用负均方误差作为评分
|
|
|
|
|
# n_jobs=-1, # 使用所有可用的CPU核心
|
|
|
|
|
# verbose=1 # 显示详细过程
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型(自动进行超参数优化和交叉验证)
|
|
|
|
|
# print("开始训练和超参数优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_Ridge.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','Ridge')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_Ridge.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
高斯回归
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
#定义更复杂的核函数组合
|
|
|
|
|
# base_kernels = [
|
|
|
|
|
# # RBF核 + 噪声
|
|
|
|
|
# ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(1e-5, (1e-8, 1e-1)),
|
|
|
|
|
# # Matern核(ν=1.5,适用于中等平滑数据)
|
|
|
|
|
# ConstantKernel(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=1.5) + WhiteKernel(1e-5, (1e-8, 1e-1)),
|
|
|
|
|
# # 组合核:RBF + 线性核
|
|
|
|
|
# ConstantKernel(1.0, (1e-3, 1e3)) * (RBF(1.0, (1e-2, 1e2)) + DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-2, 1e2))) + WhiteKernel(1e-5, (1e-8, 1e-1))
|
|
|
|
|
# ]
|
|
|
|
|
# # 创建Pipeline
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # 可替换为MinMaxScaler()
|
|
|
|
|
# ('gpr', GaussianProcessRegressor(n_restarts_optimizer=20))
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
# # 定义超参数网格(更广的范围)
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'gpr__kernel': base_kernels,
|
|
|
|
|
# 'gpr__alpha': [1e-6, 1e-5, 1e-4, 1e-3],
|
|
|
|
|
# 'gpr__normalize_y': [True, False]
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 设置K折交叉验证和网格搜索
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
# grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='neg_mean_squared_error',
|
|
|
|
|
# n_jobs=-1, verbose=2)
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_GaussianProcessRegressor.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','GaussianProcessRegressor')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_GaussianProcessRegressor.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Lasso 回归
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
# # 创建Pipeline(标准化 + Lasso)
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # 必须标准化,因为Lasso对尺度敏感
|
|
|
|
|
# ('lasso', Lasso(max_iter=10000)) # 增加迭代次数确保收敛
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
# # 4. 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'lasso__alpha': np.logspace(-6, 3, 50),  # regularization strength range: 1e-6 to 1e3
|
|
|
|
|
# 'lasso__selection': ['cyclic', 'random'] # 优化算法选择
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 5. 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 6. 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 7. 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_Lasso.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','Lasso')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_Lasso.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
ElasticNet
|
|
|
|
|
"""
|
|
|
|
|
# # Build the Pipeline (standardization + ElasticNet)
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()),  # standardization is required: ElasticNet's penalty is scale-sensitive
|
|
|
|
|
# ('model', ElasticNet(max_iter=10000)) # 增加迭代次数确保收敛
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
# # 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'model__alpha': np.logspace(-4, 2, 50),
|
|
|
|
|
# 'model__l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95] # 控制L1/L2混合比例
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_ElasticNet.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','ElasticNet')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_ElasticNet.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
K近邻回归
|
|
|
|
|
"""
|
|
|
|
|
# # 创建Pipeline
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # KNN对特征尺度敏感,必须标准化
|
|
|
|
|
# ('knn', KNeighborsRegressor())
|
|
|
|
|
# ])
|
|
|
|
|
# # 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'knn__n_neighbors': np.arange(1, 20), # 最近邻的数量
|
|
|
|
|
# # 'knn__weights': ['uniform', 'distance'], # 权重函数
|
|
|
|
|
# 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], # 计算算法
|
|
|
|
|
# # 'knn__leaf_size': np.arange(1, 20), # 叶子节点大小
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_KNeighborsRegressor.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','KNeighborsRegressor')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_KNeighborsRegressor.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
SVR
|
|
|
|
|
"""
|
|
|
|
|
# # 创建Pipeline
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # 优先尝试StandardScaler
|
|
|
|
|
# ('svr', SVR(max_iter=10000)) # 增加迭代次数确保收敛
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
# # 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'svr__kernel': ['rbf', 'poly', 'sigmoid'], # 核函数类型
|
|
|
|
|
# 'svr__C': np.logspace(-2, 4, 20), # 正则化参数(0.01到10000)
|
|
|
|
|
# 'svr__gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 10)), # 核系数
|
|
|
|
|
# 'svr__epsilon': [0.01, 0.1, 0.5] # 控制对噪声的容忍度
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_SVR.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','SVR')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_SVR.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
决策树
|
|
|
|
|
"""
|
|
|
|
|
# # 创建Pipeline
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # 虽然决策树不需要,但保留以便比较
|
|
|
|
|
# ('dtr', DecisionTreeRegressor(random_state=42))
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
# # 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'dtr__criterion': ['squared_error', 'friedman_mse'], # 分裂标准
|
|
|
|
|
# 'dtr__max_depth': [None, 3, 5, 7, 10, 15, 20], # 树的最大深度
|
|
|
|
|
# 'dtr__min_samples_split': [2, 5, 10], # 分裂所需最小样本数
|
|
|
|
|
# 'dtr__min_samples_leaf': [1, 2, 4], # 叶节点最小样本数
|
|
|
|
|
# 'dtr__max_features': ['auto', 'sqrt', 'log2', None] # 考虑的特征数量
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_DTR.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','DTR')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_DTR.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
随机森林
|
|
|
|
|
"""
|
|
|
|
|
# # 创建Pipeline
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # 可选,随机森林对尺度不敏感
|
|
|
|
|
# ('rfr', RandomForestRegressor(random_state=42, n_jobs=-1))
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
# # 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'rfr__n_estimators': np.arange(1, 50), # 树的数量
|
|
|
|
|
# 'rfr__max_depth': np.arange(1, 50), # 树的最大深度
|
|
|
|
|
# # 'rfr__min_samples_split': [2, 5, 10], # 分裂所需最小样本数
|
|
|
|
|
# # 'rfr__min_samples_leaf': [1, 2, 4], # 叶节点最小样本数
|
|
|
|
|
# # 'rfr__max_features': ['auto', 'sqrt', 'log2', 0.5, 0.8], # 考虑的特征比例
|
|
|
|
|
# # 'rfr__bootstrap': [True, False] # 是否使用bootstrap采样
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_RFR.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','RFR')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_RFR.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
ADBT
|
|
|
|
|
"""
|
|
|
|
|
# # 创建Pipeline
|
|
|
|
|
# base_estimator = DecisionTreeRegressor(max_depth=3)
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()),
|
|
|
|
|
# ('adb', AdaBoostRegressor(
|
|
|
|
|
# estimator=base_estimator, # 正确传递基础估计器
|
|
|
|
|
# random_state=42
|
|
|
|
|
# ))
|
|
|
|
|
# ])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'adb__n_estimators': [50, 100, 200],
|
|
|
|
|
# 'adb__learning_rate': [0.01, 0.1, 0.5, 1.0],
|
|
|
|
|
# 'adb__loss': ['linear', 'square', 'exponential'],
|
|
|
|
|
# # 通过estimator参数传递决策树深度
|
|
|
|
|
# 'adb__estimator': [DecisionTreeRegressor(max_depth=d) for d in [1, 2, 3, 4]]
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_ADB.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','ADB')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_ADB.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
"""
|
|
|
|
|
XGB
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# 创建Pipeline
|
2025-05-14 11:00:24 +08:00
|
|
|
|
# base_estimator = DecisionTreeRegressor(max_depth=3)
|
|
|
|
|
# pipeline = Pipeline([
|
|
|
|
|
# ('scaler', StandardScaler()), # 可选
|
|
|
|
|
# ('xgb', xgb.XGBRegressor(objective='reg:squarederror',
|
|
|
|
|
# random_state=42,
|
|
|
|
|
# n_jobs=-1))
|
|
|
|
|
# ])
|
|
|
|
|
# # 定义超参数网格
|
|
|
|
|
# param_grid = {
|
|
|
|
|
# 'xgb__n_estimators': np.arange(1,150), # 树的数量
|
|
|
|
|
# # 'xgb__max_depth': [3, 4, 5], # 树的最大深度
|
|
|
|
|
# # 'xgb__learning_rate': [0.01, 0.05, 0.1], # 学习率
|
|
|
|
|
# # 'xgb__subsample': [0.6, 0.8, 1.0], # 样本采样比例
|
|
|
|
|
# # 'xgb__colsample_bytree': [0.6, 0.8, 1.0], # 特征采样比例
|
|
|
|
|
# # 'xgb__gamma': [0, 0.1, 0.2], # 最小分裂损失
|
|
|
|
|
# # 'xgb__reg_alpha': [0, 0.1, 1], # L1正则化
|
|
|
|
|
# # 'xgb__reg_lambda': [0.1, 1, 10] # L2正则化
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# # 设置5折交叉验证
|
|
|
|
|
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
|
|
|
|
|
|
|
|
|
# # 网格搜索(优化超参数)
|
|
|
|
|
# grid_search = GridSearchCV(
|
|
|
|
|
# pipeline,
|
|
|
|
|
# param_grid,
|
|
|
|
|
# cv=kf,
|
|
|
|
|
# scoring='neg_mean_squared_error', # 最小化MSE
|
|
|
|
|
# n_jobs=-1, # 使用所有CPU核心
|
|
|
|
|
# verbose=2 # 打印进度
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
# # 训练模型
|
|
|
|
|
# print("开始网格搜索优化...")
|
|
|
|
|
# grid_search.fit(train_x, train_y)
|
|
|
|
|
# print("\n最佳参数组合:", grid_search.best_params_)
|
|
|
|
|
|
|
|
|
|
# # 使用最佳模型进行预测
|
|
|
|
|
# best_model = grid_search.best_estimator_
|
|
|
|
|
|
|
|
|
|
# # 保存模型
|
|
|
|
|
# dump(best_model, './model/SSA_XGB.joblib')
|
|
|
|
|
|
|
|
|
|
# pred_y = best_model.predict(valid_x)
|
|
|
|
|
|
|
|
|
|
# # 验证
|
|
|
|
|
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
|
|
|
|
# print(acc)
|
|
|
|
|
# # 画图保存
|
|
|
|
|
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','XGB')
|
|
|
|
|
|
|
|
|
|
# # 加载模型
|
|
|
|
|
# loaded_pipeline = load('./model/SSA_XGB.joblib')
|
|
|
|
|
|
|
|
|
|
# # 使用模型预测
|
|
|
|
|
# pred_y = loaded_pipeline.predict(valid_x)
|
|
|
|
|
# print(pred_y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""GBDT"""
|
2025-05-06 11:18:48 +08:00
|
|
|
|
# Where the best pipeline is persisted; shared by dump() and load() below.
# The file name is kept exactly as before so existing artifacts keep their name.
model_path = './model/SSA_GDBT.joblib'
# Create the output directory before the search starts: otherwise a missing
# folder is only discovered at dump() time, after the whole grid search has
# run, and the fitted model is lost.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # could be swapped for MinMaxScaler()
    ('gbdt', GradientBoostingRegressor(random_state=42))
])

param_grid = {
    'gbdt__n_estimators': [20, 30, 40, 50, 60, 70, 80, 100],  # number of trees
    'gbdt__learning_rate': [0.01, 0.1, 0.2],  # learning rate
    'gbdt__max_depth': [3, 5, 7],  # maximum tree depth
    'gbdt__min_samples_split': [2, 5],  # minimum samples needed to split a node
    'gbdt__min_samples_leaf': [1, 2, 3, 4, 5],  # minimum samples per leaf
    'gbdt__max_features': ['sqrt', 'log2'],  # features considered per split
}

# 5-fold cross-validation driving an exhaustive grid search scored by
# negative MSE (higher is better).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='neg_mean_squared_error',
                           n_jobs=-1, verbose=2)

# Fit the search on the training split.
print("开始网格搜索...")
grid_search.fit(train_x, train_y)

print("\n最佳参数组合:", grid_search.best_params_)

# Keep and persist the best pipeline found by the search.
best_model = grid_search.best_estimator_
dump(best_model, model_path)

# Score the best pipeline on the validation split and plot the result.
pred_y = best_model.predict(valid_x)
acc = evaluate_model_accuracy(pred_y, valid_y)
print(pred_y)
print("\n")
print(acc)
draw_picture(valid_y, pred_y, acc, './pic/SSA', 'GDBT')

# Round-trip check: reload the saved pipeline and predict again.
loaded_pipeline = load(model_path)
pred_y = loaded_pipeline.predict(valid_x)
print(pred_y)
|