# -*-coding:utf-8-*-
|
|
import xgboost as xgb
|
|
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.model_selection import train_test_split
|
|
from logzero import logger
|
|
import os
|
|
|
|
# Directory containing this file; anchors relative paths to the bundled
# data/ and pretrain_models/ folders regardless of the process CWD.
current_path = os.path.dirname(__file__)
def load_data():
    """Read the bundled training CSV and drop known outlier rows.

    Returns:
        pd.DataFrame: training data with large-living-area / low-price
        outliers removed (standard cleanup for this dataset).
    """
    logger.info(f"读取本地数据")
    logger.info(current_path)
    train_data = pd.read_csv(f'{current_path}/data/train.csv')
    # Boolean-mask based outlier filter: huge living area but suspiciously
    # low sale price.
    outlier_mask = (train_data["GrLivArea"] > 4000) & (train_data["SalePrice"] < 300000)
    train_data.drop(index=train_data.index[outlier_mask], inplace=True)
    return train_data
|
|
|
|
|
|
def load_model():
    """Restore the pretrained house-price model from its JSON checkpoint.

    Returns:
        xgb.XGBModel: sklearn-API model loaded from disk.
    """
    logger.info(f"读取本地模型")
    checkpoint = f'{current_path}/pretrain_models/house_price_eta0.05_round280.json'
    model = xgb.XGBModel()
    model.load_model(checkpoint)
    return model
|
|
|
|
|
|
def preprocessing(local_train: pd.DataFrame, new_data: pd.DataFrame):
    """Concatenate train and to-predict rows and run the shared feature pipeline.

    Fixes vs. previous version: column-level ``fillna(..., inplace=True)`` was a
    chained assignment (a silent no-op under pandas copy-on-write and removed in
    pandas 3.0); replaced with plain reassignment. Removed the unused
    ``year_cols`` local.

    Args:
        local_train (pd.DataFrame): locally stored data, or uploaded training data.
        new_data (pd.DataFrame): uploaded rows to predict (SalePrice absent/NaN).

    Returns:
        pd.DataFrame: one-hot-encoded dataset; numeric features log1p-transformed
        and min-max scaled to [0, 1]. Prediction rows are those whose SalePrice
        is NaN.
    """
    all_data = pd.concat([local_train, new_data]).reset_index(drop=True)
    miss = all_data.isnull().sum().sort_values(ascending=True)
    all_cols = [x for x in all_data.columns if x != 'Id' and x != 'SalePrice']

    # Columns with too many missing values cannot be imputed reliably: drop them.
    for col in all_cols:
        if miss[col] > 1000:
            logger.info(f"{col}列缺失比例过高,删除")
            all_data.drop(columns=[col], inplace=True)

    # Garage years in the future are data-entry errors: blank them out, then
    # fall back to the house's construction year.
    na_index = all_data[all_data['GarageYrBlt'] > 2022].index
    all_data.loc[na_index, 'GarageYrBlt'] = np.nan
    all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(all_data['YearBuilt'])

    # Categorical columns where NaN means "feature not present" -> literal "None".
    cols1 = ["GarageQual", "GarageCond", "GarageFinish", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual",
             "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
    for col in cols1:
        all_data[col] = all_data[col].fillna("None")

    # Numeric columns where NaN means a measured quantity of zero.
    cols2 = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
    for col in cols2:
        all_data[col] = all_data[col].astype(float).fillna(0)

    # LotFrontage: impute with the column mean (np.mean on a Series skips NaN).
    all_data["LotFrontage"] = all_data["LotFrontage"].fillna(np.mean(all_data["LotFrontage"]))

    # Low-missing-rate columns: impute with the most frequent value.
    cols3 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual",
             "SaleType", "Exterior1st", "Exterior2nd"]
    for col in cols3:
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

    numeric_cols = [x for x in all_data.select_dtypes(exclude=['object']).columns.tolist() if
                    x != 'Id' and x != 'SalePrice']
    object_cols = [x for x in all_data.select_dtypes(include=['object']).columns.tolist()]

    # Reduce skew with log1p, then min-max scale each numeric feature.
    for col in numeric_cols:
        all_data[col] = np.log1p(all_data[col])
        all_data[col] = (all_data[col] - all_data[col].min()) / (all_data[col].max() - all_data[col].min())

    dataset = pd.get_dummies(all_data, columns=object_cols)
    return dataset
|
|
|
|
|
|
def build_dataset(dataset):
    """Split the preprocessed frame into XGBoost train/eval DMatrix pairs.

    Note: mutates `dataset` in place by log1p-transforming SalePrice; rows
    with NaN SalePrice are treated as prediction rows and excluded here.

    Args:
        dataset: output of preprocessing().

    Returns:
        tuple: (dtrain, dvalid, watchlist, feature_cols).
    """
    dataset.SalePrice = np.log1p(dataset.SalePrice)
    feature_cols = [c for c in dataset.columns if c != 'Id' and c != 'SalePrice']
    labelled = dataset[~dataset.SalePrice.isna()].copy()
    # XGBoost needs numeric input; get_dummies may have produced bool columns.
    for c in feature_cols:
        labelled[c] = labelled[c].astype(float)
    train_part, valid_part = train_test_split(labelled, test_size=0.1, shuffle=True, random_state=42)
    dtrain = xgb.DMatrix(train_part[feature_cols], train_part['SalePrice'])
    dvalid = xgb.DMatrix(valid_part[feature_cols], valid_part['SalePrice'])
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    return dtrain, dvalid, watchlist, feature_cols
|
|
|
|
|
|
def build_model(dtrain, dvalid, watchlist, num_iter=5000, early_stop=200, **params):
    """Train an XGBoost booster with early stopping on the watchlist eval set.

    Args:
        dtrain: training DMatrix.
        dvalid: validation DMatrix (already included in watchlist).
        watchlist: list of (DMatrix, name) pairs evaluated each round.
        num_iter: maximum number of boosting rounds.
        early_stop: stop after this many rounds without eval improvement.
        **params: raw XGBoost training parameters.

    Returns:
        The trained booster returned by xgb.train.
    """
    logger.info('开始本地建模')
    return xgb.train(
        params,
        dtrain,
        num_boost_round=num_iter,
        evals=watchlist,
        early_stopping_rounds=early_stop,
        verbose_eval=True,
    )
|
|
|
|
|
|
def predict(data: pd.DataFrame, model, feature_cols):
    """Score `data` with a trained booster and return the submission frame.

    Note: writes a 'SalePrice' column into `data` in place. Predictions are
    expm1-inverted because training targets were log1p-transformed.

    Args:
        data: preprocessed rows to score.
        model: trained booster (Booster API, expects a DMatrix).
        feature_cols: feature column names, in training order.

    Returns:
        pd.DataFrame: copy containing only 'Id' and predicted 'SalePrice'.
    """
    features = xgb.DMatrix(data[feature_cols])
    data['SalePrice'] = np.expm1(model.predict(features))
    return data[['Id', 'SalePrice']].copy()
|
|
|
|
|
|
def run_boston_price(test_data, extra_train_data=None, num_iter=5000, early_stop=200, **params):
    """End-to-end house-price prediction.

    If `extra_train_data` is provided, a fresh model is trained on it;
    otherwise the bundled training data and pretrained model are used.

    Fixes vs. previous version: removed the unreachable duplicate
    ``test_data is None`` check in the pretrained branch and the
    commented-out dead code; raises ValueError (a subclass of Exception,
    so existing handlers still match) instead of bare Exception.

    Args:
        test_data (pd.DataFrame): rows to predict; must not be None.
        extra_train_data (pd.DataFrame | None): optional labelled training data.
        num_iter (int): max boosting rounds when training from scratch.
        early_stop (int): early-stopping patience when training from scratch.
        **params: extra XGBoost training parameters.

    Returns:
        pd.DataFrame: 'Id' and predicted 'SalePrice' columns.

    Raises:
        ValueError: if test_data is None, or extra_train_data has no usable labels.
    """
    if test_data is None:
        raise ValueError("test data is None", extra_train_data)

    if extra_train_data is not None:
        # Train a new model on the uploaded data.
        if 'SalePrice' not in extra_train_data.columns:
            raise ValueError("No SalePrice in train data", extra_train_data)
        # Keep only rows that actually carry a label.
        extra_train_data = extra_train_data[~extra_train_data['SalePrice'].isna()].copy()
        if extra_train_data.shape[0] == 0:
            raise ValueError("train data is None", extra_train_data)
        datasets = preprocessing(extra_train_data, test_data)
        dtrain, dvalid, watchlist, feature_cols = build_dataset(datasets)
        model = build_model(dtrain, dvalid, watchlist, num_iter=num_iter, early_stop=early_stop, **params)
        dtest = datasets[datasets.SalePrice.isna()].copy()
        return predict(dtest, model, feature_cols)

    # No uploaded training data: score with the pretrained sklearn-API model.
    train_data = load_data()
    datasets = preprocessing(train_data, test_data)
    _, _, _, feature_cols = build_dataset(datasets)
    model = load_model()
    dtest = datasets[datasets.SalePrice.isna()].copy()
    # XGBModel.predict accepts a DataFrame directly (no DMatrix needed).
    dtest['SalePrice'] = np.expm1(model.predict(dtest[feature_cols]))
    return dtest[['Id', 'SalePrice']]
|
|
|
|
|
|
if __name__ == '__main__':
    # Intentionally no script behavior: this module is meant to be imported.
    pass
|