# ai-station-code/meijitancailiao/meishuxing_SSA.py
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold,GridSearchCV
import numpy as np
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
2025-05-14 11:00:24 +08:00
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor
2025-05-06 11:18:48 +08:00
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.linear_model import LinearRegression,Ridge,LogisticRegression, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel, WhiteKernel, DotProduct
from sklearn.feature_selection import SelectKBest, f_regression
import matplotlib.pyplot as plt
from joblib import dump, load
import os
def get_ssa_data(path='./data/SSA.xlsx', target='SSA',
                 train_sheet='sheet1', valid_sheet='sheet2'):
    """Load the training and validation sets from an Excel workbook.

    Called with no arguments this reads ``./data/SSA.xlsx``, taking the
    training rows from ``sheet1`` and the validation rows from ``sheet2``,
    with the ``SSA`` column as the regression target.

    Args:
        path: Location of the Excel workbook.
        target: Name of the column to predict; it is removed from the
            feature frames and returned separately.
        train_sheet: Sheet holding the training rows.
        valid_sheet: Sheet holding the validation rows.

    Returns:
        Tuple ``(train_x, train_y, valid_x, valid_y)`` where the ``*_x``
        items are DataFrames without the target column and the ``*_y``
        items are the target Series.
    """
    # Passing a list of sheet names parses the workbook once and returns a
    # dict keyed by those names, instead of opening the file per sheet.
    sheets = pd.read_excel(path, sheet_name=[train_sheet, valid_sheet])
    train_data = sheets[train_sheet]
    valid_data = sheets[valid_sheet]

    train_x = train_data.drop(columns=target)
    train_y = train_data[target]
    valid_x = valid_data.drop(columns=target)
    valid_y = valid_data[target]
    return train_x, train_y, valid_x, valid_y
def evaluate_model_accuracy(predict, real):
    """Compute common regression error metrics.

    Args:
        predict: Predicted values (any array-like).
        real: Ground-truth values (any array-like matching ``predict``).

    Returns:
        Dict with the keys ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'MAPE'`` and
        ``'R_2'``. ``'MAPE'`` is expressed in percent and is averaged over
        the samples whose true value is non-zero; it is ``nan`` when every
        true value is zero.
    """
    predict = np.asarray(predict)
    real = np.asarray(real)

    mae = mean_absolute_error(real, predict)
    mse = mean_squared_error(real, predict)
    rmse = np.sqrt(mse)

    # A zero target makes the relative error undefined (division by zero
    # would yield inf/nan and poison the mean), so such samples are skipped.
    nonzero = real != 0
    if nonzero.any():
        relative_error = (real[nonzero] - predict[nonzero]) / real[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float('nan')

    r2 = r2_score(real, predict)

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'MAPE': mape,
        'R_2': r2,
    }
def draw_picture(y_test, y_pred, model_acc, save_path, title):
    """Draw a predicted-vs-actual scatter plot, save it as PNG and show it.

    The plot contains the scatter points, a red ``y = x`` reference line
    spanning the range of ``y_test``, and a text box listing the metrics.

    Args:
        y_test: Actual target values.
        y_pred: Predicted target values.
        model_acc: Mapping with the keys ``'MSE'``, ``'RMSE'``, ``'MAE'``,
            ``'MAPE'`` and ``'R_2'``; each value is rounded to two decimals
            for display.
        save_path: Directory for the image; created if it does not exist.
        title: Plot title, also used as the file name (``<title>.png``).
    """
    plt.scatter(y_test, y_pred, c='blue', marker='o', label='Predicted vs Actual')

    # Reference diagonal: points on this line are perfect predictions.
    lowest, highest = min(y_test), max(y_test)
    plt.plot([lowest, highest], [lowest, highest], color='red', lw=2, label='y = x')

    plt.xlabel('Actual values', fontsize=16)
    plt.ylabel('Predicted values', fontsize=16)
    plt.title(title)
    plt.legend(loc='best')

    metrics_text = (f"MSE: {round(model_acc['MSE'], 2)}\n"
                    f"RMSE: {round(model_acc['RMSE'], 2)}\n"
                    f"MAE: {round(model_acc['MAE'], 2)}\n"
                    f"MAPE: {round(model_acc['MAPE'], 2)}%\n"
                    f"R_square: {round(model_acc['R_2'], 2)}")
    # Position is given in axes coordinates (0..1) via transAxes.
    plt.text(0.75, 0.25, metrics_text, transform=plt.gca().transAxes,
             fontsize=10, verticalalignment='top')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, title + '.png')
    plt.savefig(path)
    print(f"图形已保存到: {path}")
    plt.show()
# Load the train/validation split once at module level; the model section
# further down reads these four names directly.
(train_x, train_y, valid_x, valid_y) = get_ssa_data()
"""
线性回归
"""
# # 假设训练时的 Pipeline
# pipeline = Pipeline([
# ('scaler', StandardScaler()),
# ('regressor', LinearRegression())
# ])
# pipeline.fit(train_x, train_y)
# # 保存模型
# dump(pipeline, './model/SSA_LinearRegression.joblib')
# # 预测
# pred_y = pipeline.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','LinearRegression')
# # 加载模型
# # loaded_pipeline = load('./model/SSA_LinearRegression.joblib')
# # # 使用模型预测
# # pred_y = loaded_pipeline.predict(valid_x)
# # print(pred_y)
"""
岭回归
"""
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 数据标准化
# ('ridge', Ridge()) # 岭回归模型
# ])
# # 设置超参数网格
# param_grid = {
# 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100], # 正则化强度
# 'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sag'], # 求解器
# 'ridge__max_iter': [100, 500, 1000], # 最大迭代次数
# 'ridge__tol': [1e-4, 1e-3] # 收敛阈值
# }
# # 设置K折交叉验证
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# # 创建GridSearchCV对象
# grid_search = GridSearchCV(
# estimator=pipeline,
# param_grid=param_grid,
# cv=kfold,
# scoring='neg_mean_squared_error', # 使用负均方误差作为评分
# n_jobs=-1, # 使用所有可用的CPU核心
# verbose=1 # 显示详细过程
# )
# # 训练模型(自动进行超参数优化和交叉验证)
# print("开始训练和超参数优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_Ridge.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','Ridge')
# # 加载模型
# loaded_pipeline = load('./model/SSA_Ridge.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
高斯回归
"""
#定义更复杂的核函数组合
# base_kernels = [
# # RBF核 + 噪声
# ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(1e-5, (1e-8, 1e-1)),
# # Matern核(ν=1.5,适用于中等平滑数据)
# ConstantKernel(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=1.5) + WhiteKernel(1e-5, (1e-8, 1e-1)),
# # 组合核RBF + 线性核
# ConstantKernel(1.0, (1e-3, 1e3)) * (RBF(1.0, (1e-2, 1e2)) + DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-2, 1e2))) + WhiteKernel(1e-5, (1e-8, 1e-1))
# ]
# # 创建Pipeline
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 可替换为MinMaxScaler()
# ('gpr', GaussianProcessRegressor(n_restarts_optimizer=20))
# ])
# # 定义超参数网格(更广的范围)
# param_grid = {
# 'gpr__kernel': base_kernels,
# 'gpr__alpha': [1e-6, 1e-5, 1e-4, 1e-3],
# 'gpr__normalize_y': [True, False]
# }
# # 设置K折交叉验证和网格搜索
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='neg_mean_squared_error',
# n_jobs=-1, verbose=2)
# # 训练模型
# print("开始网格搜索...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_GaussianProcessRegressor.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','GaussianProcessRegressor')
# # 加载模型
# loaded_pipeline = load('./model/SSA_GaussianProcessRegressor.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
Lasso 回归
"""
# # 创建Pipeline标准化 + Lasso
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 必须标准化因为Lasso对尺度敏感
# ('lasso', Lasso(max_iter=10000)) # 增加迭代次数确保收敛
# ])
# # 4. 定义超参数网格
# param_grid = {
# 'lasso__alpha': np.logspace(-6, 3, 50), # 正则化系数范围1e-6到1e3
# 'lasso__selection': ['cyclic', 'random'] # 优化算法选择
# }
# # 5. 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 6. 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 7. 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_Lasso.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','Lasso')
# # 加载模型
# loaded_pipeline = load('./model/SSA_Lasso.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
ElasticNet
"""
# # 创建Pipeline标准化 + ElasticNet
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 必须标准化因为ElasticNet对尺度敏感
# ('model', ElasticNet(max_iter=10000)) # 增加迭代次数确保收敛
# ])
# # 定义超参数网格
# param_grid = {
# 'model__alpha': np.logspace(-4, 2, 50),
# 'model__l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95] # 控制L1/L2混合比例
# }
# # 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_ElasticNet.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','ElasticNet')
# # 加载模型
# loaded_pipeline = load('./model/SSA_ElasticNet.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
K近邻回归
"""
# # 创建Pipeline
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # KNN对特征尺度敏感必须标准化
# ('knn', KNeighborsRegressor())
# ])
# # 定义超参数网格
# param_grid = {
# 'knn__n_neighbors': np.arange(1, 20), # 最近邻的数量
# # 'knn__weights': ['uniform', 'distance'], # 权重函数
# 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], # 计算算法
# # 'knn__leaf_size': np.arange(1, 20), # 叶子节点大小
# }
# # 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_KNeighborsRegressor.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','KNeighborsRegressor')
# # 加载模型
# loaded_pipeline = load('./model/SSA_KNeighborsRegressor.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
SVR
"""
# # 创建Pipeline
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 优先尝试StandardScaler
# ('svr', SVR(max_iter=10000)) # 增加迭代次数确保收敛
# ])
# # 定义超参数网格
# param_grid = {
# 'svr__kernel': ['rbf', 'poly', 'sigmoid'], # 核函数类型
# 'svr__C': np.logspace(-2, 4, 20), # 正则化参数0.01到10000
# 'svr__gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 10)), # 核系数
# 'svr__epsilon': [0.01, 0.1, 0.5] # 控制对噪声的容忍度
# }
# # 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_SVR.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','SVR')
# # 加载模型
# loaded_pipeline = load('./model/SSA_SVR.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
决策树
"""
# # 创建Pipeline
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 虽然决策树不需要,但保留以便比较
# ('dtr', DecisionTreeRegressor(random_state=42))
# ])
# # 定义超参数网格
# param_grid = {
# 'dtr__criterion': ['squared_error', 'friedman_mse'], # 分裂标准
# 'dtr__max_depth': [None, 3, 5, 7, 10, 15, 20], # 树的最大深度
# 'dtr__min_samples_split': [2, 5, 10], # 分裂所需最小样本数
# 'dtr__min_samples_leaf': [1, 2, 4], # 叶节点最小样本数
# 'dtr__max_features': ['auto', 'sqrt', 'log2', None] # 考虑的特征数量
# }
# # 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_DTR.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','DTR')
# # 加载模型
# loaded_pipeline = load('./model/SSA_DTR.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
随机森林
"""
# # 创建Pipeline
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 可选,随机森林对尺度不敏感
# ('rfr', RandomForestRegressor(random_state=42, n_jobs=-1))
# ])
# # 定义超参数网格
# param_grid = {
# 'rfr__n_estimators': np.arange(1, 50), # 树的数量
# 'rfr__max_depth': np.arange(1, 50), # 树的最大深度
# # 'rfr__min_samples_split': [2, 5, 10], # 分裂所需最小样本数
# # 'rfr__min_samples_leaf': [1, 2, 4], # 叶节点最小样本数
# # 'rfr__max_features': ['auto', 'sqrt', 'log2', 0.5, 0.8], # 考虑的特征比例
# # 'rfr__bootstrap': [True, False] # 是否使用bootstrap采样
# }
# # 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_RFR.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','RFR')
# # 加载模型
# loaded_pipeline = load('./model/SSA_RFR.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
ADBT
"""
# # 创建Pipeline
# base_estimator = DecisionTreeRegressor(max_depth=3)
# pipeline = Pipeline([
# ('scaler', StandardScaler()),
# ('adb', AdaBoostRegressor(
# estimator=base_estimator, # 正确传递基础估计器
# random_state=42
# ))
# ])
# # 定义超参数网格
# param_grid = {
# 'adb__n_estimators': [50, 100, 200],
# 'adb__learning_rate': [0.01, 0.1, 0.5, 1.0],
# 'adb__loss': ['linear', 'square', 'exponential'],
# # 通过estimator参数传递决策树深度
# 'adb__estimator': [DecisionTreeRegressor(max_depth=d) for d in [1, 2, 3, 4]]
# }
# # 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_ADB.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','ADB')
# # 加载模型
# loaded_pipeline = load('./model/SSA_ADB.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""
XGB
"""
# 创建Pipeline
2025-05-14 11:00:24 +08:00
# base_estimator = DecisionTreeRegressor(max_depth=3)
# pipeline = Pipeline([
# ('scaler', StandardScaler()), # 可选
# ('xgb', xgb.XGBRegressor(objective='reg:squarederror',
# random_state=42,
# n_jobs=-1))
# ])
# # 定义超参数网格
# param_grid = {
# 'xgb__n_estimators': np.arange(1,150), # 树的数量
# # 'xgb__max_depth': [3, 4, 5], # 树的最大深度
# # 'xgb__learning_rate': [0.01, 0.05, 0.1], # 学习率
# # 'xgb__subsample': [0.6, 0.8, 1.0], # 样本采样比例
# # 'xgb__colsample_bytree': [0.6, 0.8, 1.0], # 特征采样比例
# # 'xgb__gamma': [0, 0.1, 0.2], # 最小分裂损失
# # 'xgb__reg_alpha': [0, 0.1, 1], # L1正则化
# # 'xgb__reg_lambda': [0.1, 1, 10] # L2正则化
# }
# # 设置5折交叉验证
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # 网格搜索(优化超参数)
# grid_search = GridSearchCV(
# pipeline,
# param_grid,
# cv=kf,
# scoring='neg_mean_squared_error', # 最小化MSE
# n_jobs=-1, # 使用所有CPU核心
# verbose=2 # 打印进度
# )
# # 训练模型
# print("开始网格搜索优化...")
# grid_search.fit(train_x, train_y)
# print("\n最佳参数组合:", grid_search.best_params_)
# # 使用最佳模型进行预测
# best_model = grid_search.best_estimator_
# # 保存模型
# dump(best_model, './model/SSA_XGB.joblib')
# pred_y = best_model.predict(valid_x)
# # 验证
# acc = evaluate_model_accuracy(pred_y,valid_y)
# print(acc)
# # 画图保存
# draw_picture(valid_y,pred_y,acc,'./pic/ssa','XGB')
# # 加载模型
# loaded_pipeline = load('./model/SSA_XGB.joblib')
# # 使用模型预测
# pred_y = loaded_pipeline.predict(valid_x)
# print(pred_y)
"""GBDT"""
# GBDT: standardize the features, then tune a GradientBoostingRegressor with
# an exhaustive grid search scored by negative MSE under 5-fold CV.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # could be swapped for MinMaxScaler()
    ('gbdt', GradientBoostingRegressor(random_state=42)),
])
param_grid = {
    'gbdt__n_estimators': [20, 30, 40, 50, 60, 70, 80, 100],  # number of trees
    'gbdt__learning_rate': [0.01, 0.1, 0.2],  # shrinkage per boosting step
    'gbdt__max_depth': [3, 5, 7],  # maximum depth of each tree
    'gbdt__min_samples_split': [2, 5],  # min samples needed to split a node
    'gbdt__min_samples_leaf': [1, 2, 3, 4, 5],  # min samples in a leaf
    'gbdt__max_features': ['sqrt', 'log2'],  # features considered per split
}

# K-fold cross-validation and grid search setup.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=kf,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1, verbose=2)

# Make sure the model directory exists *before* the long search starts, so a
# missing folder cannot throw away the fitted result at dump() time.
os.makedirs('./model', exist_ok=True)

# Train: fits every parameter combination on every fold.
print("开始网格搜索...")
grid_search.fit(train_x, train_y)
print("\n最佳参数组合:", grid_search.best_params_)

# Persist the best pipeline (scaler + regressor) and evaluate it on the
# validation set.
best_model = grid_search.best_estimator_
dump(best_model, './model/SSA_GDBT.joblib')
pred_y = best_model.predict(valid_x)
acc = evaluate_model_accuracy(pred_y, valid_y)
print(pred_y)
print("\n")
print(acc)
draw_picture(valid_y, pred_y, acc, './pic/SSA', 'GDBT')

# Round-trip check: reload the saved pipeline and predict again.
loaded_pipeline = load('./model/SSA_GDBT.joblib')
pred_y = loaded_pipeline.predict(valid_x)
print(pred_y)