# TPV regression benchmarking script: loads the TPV dataset and compares
# multiple scikit-learn / XGBoost regressors (most sections kept commented out).
import pandas as pd
|
||
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
|
||
from sklearn.model_selection import train_test_split, cross_val_score, KFold,GridSearchCV
|
||
import numpy as np
|
||
import xgboost as xgb
|
||
from sklearn.svm import SVR
|
||
from sklearn.tree import DecisionTreeRegressor
|
||
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor
|
||
from sklearn.pipeline import Pipeline
|
||
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
|
||
from sklearn.linear_model import LinearRegression,Ridge,LogisticRegression, Lasso, ElasticNet
|
||
from sklearn.neighbors import KNeighborsRegressor
|
||
from sklearn.gaussian_process import GaussianProcessRegressor
|
||
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel, WhiteKernel, DotProduct
|
||
from sklearn.feature_selection import SelectKBest, f_regression
|
||
import matplotlib.pyplot as plt
|
||
from joblib import dump, load
|
||
import os
|
||
|
||
|
||
|
||
def get_TPV_data(path='./data/TPV.xlsx', train_sheet='sheet1',
                 valid_sheet='sheet2', target='TPV'):
    """Load the train/validation split of the TPV dataset from an Excel workbook.

    Parameters
    ----------
    path : str, optional
        Workbook location (defaults to the project's TPV dataset).
    train_sheet, valid_sheet : str, optional
        Sheet names holding the training and validation rows.
    target : str, optional
        Name of the label column; every other column is treated as a feature.

    Returns
    -------
    tuple
        ``(train_x, train_y, valid_x, valid_y)`` — feature DataFrames and
        target Series for the training and validation sheets.
    """
    train_data = pd.read_excel(path, sheet_name=train_sheet)
    valid_data = pd.read_excel(path, sheet_name=valid_sheet)
    train_x = train_data.drop(target, axis=1)
    train_y = train_data[target]
    valid_x = valid_data.drop(target, axis=1)
    valid_y = valid_data[target]
    return train_x, train_y, valid_x, valid_y
|
||
|
||
|
||
|
||
def evaluate_model_accuracy(predict, real):
    """Score regression predictions against ground-truth values.

    Parameters
    ----------
    predict : array-like
        Model predictions.
    real : array-like
        True target values (same length as ``predict``).

    Returns
    -------
    dict
        ``MAE``, ``MSE``, ``RMSE``, ``MAPE`` (in percent; assumes no zeros
        in ``real``) and ``R_2``.
    """
    predict = np.array(predict)
    real = np.array(real)
    # Squared error is computed once and reused for RMSE.
    mse = mean_squared_error(real, predict)
    return {
        'MAE': mean_absolute_error(real, predict),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        # Mean absolute percentage error, expressed in percent.
        'MAPE': np.mean(np.abs((real - predict) / real)) * 100,
        'R_2': r2_score(real, predict),
    }
|
||
|
||
|
||
def draw_picture(y_test, y_pred, model_acc, save_path, title):
    """Scatter-plot predictions vs. actual values, annotate with metrics,
    save the figure to ``<save_path>/<title>.png`` and display it.

    Parameters
    ----------
    y_test : array-like
        Actual target values.
    y_pred : array-like
        Model predictions (same length as ``y_test``).
    model_acc : dict
        Metrics dict as returned by ``evaluate_model_accuracy``
        (keys ``MSE``, ``RMSE``, ``MAE``, ``MAPE``, ``R_2``).
    save_path : str
        Directory for the saved figure; created if it does not exist.
    title : str
        Plot title, also used as the image file name.
    """
    plt.scatter(y_test, y_pred, c='blue', marker='o', label='Predicted vs Actual')
    # Reference line y = x: perfect predictions fall exactly on it.
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)],
             color='red', lw=2, label='y = x')
    plt.xlabel('Actual values', fontsize=16)
    plt.ylabel('Predicted values', fontsize=16)
    plt.title(title)
    plt.legend(loc='best')
    # Single metrics annotation (the original duplicated this text block as a
    # dead `plt.metrics_text = ...` assignment, which only attached an unused
    # attribute to the matplotlib module).
    metrics_text = (f"MSE: {round(model_acc['MSE'], 2)}\n"
                    f"RMSE: {round(model_acc['RMSE'], 2)}\n"
                    f"MAE: {round(model_acc['MAE'], 2)}\n"
                    f"MAPE: {round(model_acc['MAPE'], 2)}%\n"
                    f"R_square: {round(model_acc['R_2'], 2)}")
    plt.text(0.75, 0.25, metrics_text, transform=plt.gca().transAxes,
             fontsize=10, verticalalignment='top')

    # Create the output directory atomically (avoids the exists()/makedirs race).
    os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, title + '.png')
    plt.savefig(path)
    print(f"图形已保存到: {path}")
    plt.show()
|
||
|
||
|
||
# Load the train/validation split once at module level; the commented-out
# model sections below all read these four names.
train_x,train_y,valid_x,valid_y = get_TPV_data()
|
||
|
||
|
||
"""
|
||
线性回归
|
||
"""
|
||
# # 假设训练时的 Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()),
|
||
# ('regressor', LinearRegression())
|
||
# ])
|
||
# pipeline.fit(train_x, train_y)
|
||
|
||
# # 保存模型
|
||
# dump(pipeline, './model/TPV_LinearRegression.joblib')
|
||
# # 预测
|
||
# pred_y = pipeline.predict(valid_x)
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','LinearRegression')
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_LinearRegression.joblib')
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
"""
|
||
岭回归
|
||
"""
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 数据标准化
|
||
# ('ridge', Ridge()) # 岭回归模型
|
||
# ])
|
||
|
||
# # 设置超参数网格
|
||
# param_grid = {
|
||
# 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100], # 正则化强度
|
||
# 'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sag'], # 求解器
|
||
# 'ridge__max_iter': [100, 500, 1000], # 最大迭代次数
|
||
# 'ridge__tol': [1e-4, 1e-3] # 收敛阈值
|
||
# }
|
||
|
||
# # 设置K折交叉验证
|
||
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 创建GridSearchCV对象
|
||
# grid_search = GridSearchCV(
|
||
# estimator=pipeline,
|
||
# param_grid=param_grid,
|
||
# cv=kfold,
|
||
# scoring='neg_mean_squared_error', # 使用负均方误差作为评分
|
||
# n_jobs=-1, # 使用所有可用的CPU核心
|
||
# verbose=1 # 显示详细过程
|
||
# )
|
||
|
||
# # 训练模型(自动进行超参数优化和交叉验证)
|
||
# print("开始训练和超参数优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_Ridge.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','Ridge')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_Ridge.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""
|
||
高斯回归
|
||
"""
|
||
|
||
|
||
# # 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 可替换为MinMaxScaler()
|
||
# ('gpr', GaussianProcessRegressor(n_restarts_optimizer=20))
|
||
# ])
|
||
|
||
# #定义更复杂的核函数组合
|
||
# kernels = [
|
||
# ConstantKernel() * RBF(),
|
||
# ConstantKernel() * Matern(),
|
||
# ConstantKernel() * RBF() + ConstantKernel() * RBF(),
|
||
# ]
|
||
|
||
# # 定义超参数网格(更广的范围)
|
||
# param_grid = {
|
||
# 'gpr__kernel': kernels,
|
||
# 'gpr__alpha': [1e-10, 1e-8, 1e-6, 1e-4],
|
||
# 'gpr__n_restarts_optimizer': [3, 5, 10],
|
||
# }
|
||
|
||
# # 设置K折交叉验证和网格搜索
|
||
# kf = KFold(n_splits=10, shuffle=True, random_state=42)
|
||
# grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='neg_mean_squared_error',
|
||
# n_jobs=-1, verbose=2)
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索...")
|
||
# grid_search.fit(train_x, train_y)
|
||
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_GaussianProcessRegressor.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','GaussianProcessRegressor')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_GaussianProcessRegressor.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""
|
||
Lasso 回归
|
||
|
||
"""
|
||
# # 创建Pipeline(标准化 + Lasso)
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 必须标准化,因为Lasso对尺度敏感
|
||
# ('lasso', Lasso(max_iter=10000)) # 增加迭代次数确保收敛
|
||
# ])
|
||
|
||
# # 4. 定义超参数网格
|
||
# param_grid = {
|
||
# 'lasso__alpha': np.logspace(-6, 3, 50), # 正则化系数范围:0.0001到100
|
||
# 'lasso__selection': ['cyclic', 'random'] # 优化算法选择
|
||
# }
|
||
|
||
# # 5. 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 6. 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 7. 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_Lasso.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','Lasso')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_Lasso.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""
|
||
ElasticNet
|
||
"""
|
||
# # 创建Pipeline(标准化 + Lasso)
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 必须标准化,因为Lasso对尺度敏感
|
||
# ('model', ElasticNet(max_iter=10000)) # 增加迭代次数确保收敛
|
||
# ])
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'model__alpha': np.logspace(-4, 2, 50),
|
||
# 'model__l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95] # 控制L1/L2混合比例
|
||
# }
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_ElasticNet.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','ElasticNet')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_ElasticNet.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
"""
|
||
K近邻回归
|
||
"""
|
||
# # 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # KNN对特征尺度敏感,必须标准化
|
||
# ('knn', KNeighborsRegressor())
|
||
# ])
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'knn__n_neighbors': np.arange(1, 10), # 最近邻的数量
|
||
# #'knn__weights': ['uniform', 'distance'], # 权重函数
|
||
# #'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], # 计算算法
|
||
# # 'knn__leaf_size': np.arange(1, 50), # 叶子节点大小
|
||
# # 'knn__p':[1,2],
|
||
# #'knn__metric': ['euclidean', 'manhattan'] # 距离度量
|
||
# }
|
||
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_KNeighborsRegressor.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','KNeighborsRegressor')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_KNeighborsRegressor.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
"""
|
||
SVR
|
||
"""
|
||
# # 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 优先尝试StandardScaler
|
||
# ('svr', SVR(max_iter=10000)) # 增加迭代次数确保收敛
|
||
# ])
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], # 核函数类型
|
||
# #'svr__C': np.linspace(0.8,1.2,50), # np.linspace(0,0.2,50)
|
||
# # 'svr__gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 10)), # 核系数
|
||
# #'svr__epsilon': np.linspace(0.1,0.2,50) # 控制对噪声的容忍度
|
||
# }
|
||
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_SVR.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','SVR')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_SVR.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
"""
|
||
决策树
|
||
"""
|
||
# 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 虽然决策树不需要,但保留以便比较
|
||
# ('dtr', DecisionTreeRegressor(random_state=42))
|
||
# ])
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# # 'dtr__criterion': ['squared_error', 'friedman_mse'], # 分裂标准
|
||
# 'dtr__max_depth': np.arange(1,30) ,# [None, 3, 5, 7, 10, 15, 20], # 树的最大深度
|
||
# 'dtr__min_samples_split': np.arange(1,10), # 分裂所需最小样本数
|
||
# 'dtr__min_samples_leaf': np.arange(1,50), # 叶节点最小样本数
|
||
# # 'dtr__max_features': ['auto', 'sqrt', 'log2', None] # 考虑的特征数量
|
||
# }
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_DTR.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','DTR')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_DTR.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
"""
|
||
随机森林
|
||
"""
|
||
## 创建Pipeline
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 可选,随机森林对尺度不敏感
|
||
# ('rfr', RandomForestRegressor(random_state=42, n_jobs=-1))
|
||
# ])
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'rfr__n_estimators': np.arange(1, 50), # 树的数量
|
||
# 'rfr__max_depth': np.arange(1, 50), # 树的最大深度
|
||
# # 'rfr__min_samples_split': [2, 5, 10], # 分裂所需最小样本数
|
||
# # 'rfr__min_samples_leaf': [1, 2, 4], # 叶节点最小样本数
|
||
# # 'rfr__max_features': ['auto', 'sqrt', 'log2', 0.5, 0.8], # 考虑的特征比例
|
||
# # 'rfr__bootstrap': [True, False] # 是否使用bootstrap采样
|
||
# }
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_RFR.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','RFR')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_RFR.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
"""
|
||
ADBT
|
||
"""
|
||
# # 创建Pipeline
|
||
# base_estimator = DecisionTreeRegressor(max_depth=3)
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()),
|
||
# ('adb', AdaBoostRegressor(
|
||
# estimator=base_estimator, # 正确传递基础估计器
|
||
# random_state=42
|
||
# ))
|
||
# ])
|
||
|
||
|
||
# # 定义超参数网格
|
||
# param_grid = {
|
||
# 'adb__n_estimators': [50, 100, 200],
|
||
# 'adb__learning_rate': [0.01, 0.1, 0.5, 1.0],
|
||
# 'adb__loss': ['linear', 'square', 'exponential'],
|
||
# # 通过estimator参数传递决策树深度
|
||
# 'adb__estimator': [DecisionTreeRegressor(max_depth=d) for d in [1, 2, 3, 4]]
|
||
# }
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_ADB.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','ADB')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_ADB.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
"""
|
||
XGB
|
||
"""
|
||
|
||
# # 创建Pipeline
|
||
# base_estimator = DecisionTreeRegressor(max_depth=3)
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 可选
|
||
# ('xgb', xgb.XGBRegressor(objective='reg:squarederror',
|
||
# random_state=42,
|
||
# n_jobs=-1))
|
||
# ])
|
||
|
||
|
||
# # 定义超参数网格
|
||
|
||
# param_grid = {
|
||
# # 'xgb__n_estimators': np.arange(50,150), # 树的数量
|
||
# 'xgb__min_child_weight':[9],
|
||
# 'xgb__max_depth': [8], # 树的最大深度
|
||
# #'xgb__learning_rate': [0.01, 0.05, 0.1], # 学习率
|
||
# 'xgb__subsample': [0.9], # 样本采样比例
|
||
# 'xgb__eta':[0.33],
|
||
# # 'xgb__colsample_bytree': [0.6, 0.8, 1.0], # 特征采样比例
|
||
# # 'xgb__gamma': [0, 0.1, 0.2], # 最小分裂损失
|
||
# # 'xgb__reg_alpha': [0, 0.1, 1], # L1正则化
|
||
# # 'xgb__reg_lambda': [0.1, 1, 10] # L2正则化
|
||
# }
|
||
|
||
|
||
# # 设置5折交叉验证
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
|
||
# # 网格搜索(优化超参数)
|
||
# grid_search = GridSearchCV(
|
||
# pipeline,
|
||
# param_grid,
|
||
# cv=kf,
|
||
# scoring='neg_mean_squared_error', # 最小化MSE
|
||
# n_jobs=-1, # 使用所有CPU核心
|
||
# verbose=2 # 打印进度
|
||
# )
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索优化...")
|
||
# grid_search.fit(train_x, train_y)
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# # 使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# # 保存模型
|
||
# dump(best_model, './model/TPV_XGB.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
|
||
# # 验证
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(acc)
|
||
# # 画图保存
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','XGB')
|
||
|
||
# # 加载模型
|
||
# loaded_pipeline = load('./model/TPV_XGB.joblib')
|
||
|
||
# # 使用模型预测
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|
||
|
||
"""GBDT"""
|
||
# pipeline = Pipeline([
|
||
# ('scaler', StandardScaler()), # 可替换为MinMaxScaler()
|
||
# ('gbdt', GradientBoostingRegressor(random_state=42))
|
||
# ])
|
||
# param_grid = {
|
||
# 'gbdt__n_estimators': [20, 30, 40, 50, 60, 70, 80,100], # 树的数量
|
||
# 'gbdt__learning_rate': [0.01, 0.1, 0.2], # 学习率
|
||
# 'gbdt__max_depth': [3, 5, 7], # 树的最大深度
|
||
# 'gbdt__min_samples_split': [2, 5], # 分裂所需最小样本数
|
||
# 'gbdt__min_samples_leaf': [1, 2,3,4,5], # 叶节点最小样本数
|
||
# 'gbdt__max_features': ['sqrt', 'log2'], # 特征选择方式
|
||
# }
|
||
# # 设置K折交叉验证和网格搜索
|
||
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
|
||
# grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='neg_mean_squared_error',
|
||
# n_jobs=-1, verbose=2)
|
||
|
||
# # 训练模型
|
||
# print("开始网格搜索...")
|
||
# grid_search.fit(train_x, train_y)
|
||
|
||
# print("\n最佳参数组合:", grid_search.best_params_)
|
||
|
||
# #使用最佳模型进行预测
|
||
# best_model = grid_search.best_estimator_
|
||
|
||
# dump(best_model, './model/TPV_GDBT.joblib')
|
||
|
||
# pred_y = best_model.predict(valid_x)
|
||
# acc = evaluate_model_accuracy(pred_y,valid_y)
|
||
# print(pred_y)
|
||
# print("\n")
|
||
# print(acc)
|
||
# draw_picture(valid_y,pred_y,acc,'./pic/TPV','GDBT')
|
||
# loaded_pipeline = load('./model/TPV_GDBT.joblib')
|
||
# pred_y = loaded_pipeline.predict(valid_x)
|
||
# print(pred_y)
|
||
|