# wgz_forecast/pv/pv_train.py
import os

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from logzero import logger
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
def time_series_to_supervised(data, columns, n_in=24, n_out=1, dropnan=True):
    """Convert an observation sequence into a supervised-learning table.

    Each output row holds the current observation (plain column names),
    the preceding ``n_in - 1`` lagged observations (columns suffixed
    ``(t-i)``) and the next ``n_out`` future observations (columns
    suffixed ``(t+i)``).

    Args:
        data: Observation sequence as a list or a 2-D NumPy array.
        n_in (int): Number of lag observations used as input X; clamped to >= 1.
        n_out (int): Number of future observations used as output y; clamped to >= 0.
        dropnan (bool): Whether to drop rows containing the NaN values
            introduced at the edges by shifting. Default True.
        columns: Names of the observed variables, used to label the output.

    Returns:
        pd.DataFrame: Supervised-learning frame with named columns.
    """
    # np.shape() works for both lists and arrays; the previous data.shape
    # crashed on plain lists, which this function otherwise supports.
    logger.info(f"正在处理训练数据size{np.shape(data)}")
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df = pd.DataFrame(data)
    orig_names = list(columns)
    cols, names = [], []
    # Current time step t, with unsuffixed column names.
    cols.append(df.shift(0))
    names += [orig_names[j] for j in range(n_vars)]
    # Lagged inputs t-(n_in-1) .. t-1; at least the current step is kept.
    n_in = max(1, n_in)
    for i in range(n_in - 1, 0, -1):
        cols.append(df.shift(i))
        names += ['%s(t-%d)' % (orig_names[j], i) for j in range(n_vars)]
    # Future outputs t+1 .. t+n_out.
    n_out = max(n_out, 0)
    for i in range(1, n_out + 1):
        cols.append(df.shift(-i))
        names += ['%s(t+%d)' % (orig_names[j], i) for j in range(n_vars)]
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg
def train_model(train_data: pd.DataFrame):
    """Train an XGBoost regressor on a supervised table and persist it.

    The last column of ``train_data`` is the target; every other column is
    a feature. Rows are split 80/10/10 into train/validation/test; the
    validation split drives early stopping and the held-out test split is
    used for the reported RMSE / R2. The fitted model is dumped to
    ``./models/pv_pred.joblib``.

    Args:
        train_data (pd.DataFrame): 训练集 — features in all columns except
            the last, target in the last column.
    """
    # Feature / target column names — adjust per business scenario.
    fea_cols = train_data.columns[:-1].tolist()
    out_cols = train_data.columns[-1:].tolist()
    # BUG FIX: logger.info(fea_cols, out_cols) passed out_cols as a
    # %-format argument for a non-format first argument; build the message
    # explicitly instead.
    logger.info(f"features: {fea_cols}, targets: {out_cols}")
    X = train_data[fea_cols]
    y = train_data[out_cols]
    # 80% train, 10% validation (early stopping), 10% test (final metrics).
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
    valid_X, test_X, valid_y, test_y = train_test_split(test_X, test_y, test_size=0.5, random_state=42)
    # 参数 (hyper-parameters)
    other_params = {'learning_rate': 0.1, 'n_estimators': 150, 'max_depth': 10, 'min_child_weight': 1, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    # Use the module logger instead of a stray print() for consistency.
    logger.info(f"train shapes: X={train_X.shape}, y={train_y.shape}")
    gbm = xgb.XGBRegressor(objective='reg:squarederror', **other_params)
    # NOTE(review): early_stopping_rounds as a fit() kwarg requires
    # xgboost < 2.0; on >= 2.0 it must be passed to the constructor.
    gbm.fit(train_X.values, train_y.values, eval_set=[(valid_X.values, valid_y.values)], early_stopping_rounds=20)
    y_pred = gbm.predict(test_X.values)
    logger.info(f"Root Mean Squared Error on Test set: {np.sqrt(mean_squared_error(test_y, y_pred))}")
    logger.info(f"R2 score on Test set: {r2_score(test_y, y_pred)}")
    # Ensure the output directory exists before dumping the model.
    os.makedirs('./models', exist_ok=True)
    joblib.dump(gbm, './models/pv_pred.joblib')
    logger.info(f"save_path: ./models/pv_pred.joblib")
if __name__ == '__main__':
    # Load hourly PV measurements; the first CSV column is the row index.
    raw = pd.read_csv('./data/pv_data_hourly.csv', index_col=0)
    # Build a 24-step lagged supervised table and fit the forecaster on it.
    supervised = time_series_to_supervised(raw.values, raw.columns, 24, 1)
    train_model(supervised)