ai_platform_regression/house_price/house_price_predcition.py

# -*- coding: utf-8 -*-
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from logzero import logger
from sklearn.model_selection import train_test_split

current_path = os.path.dirname(__file__)


def load_data():
    logger.info("Loading local training data")
    logger.info(current_path)
    train_data = pd.read_csv(f'{current_path}/data/train.csv')
    # Conditional indexing in pandas: drop outliers with a large living area but a low sale price
    train_data.drop(train_data[(train_data["GrLivArea"] > 4000) & (train_data["SalePrice"] < 300000)].index,
                    inplace=True)
    return train_data


def load_model():
    logger.info("Loading the local pretrained model")
    model = xgb.XGBModel()
    model.load_model(f'{current_path}/pretrain_models/house_price_eta0.05_round280.json')
    return model


def preprocessing(local_train: pd.DataFrame, new_data: pd.DataFrame):
    """Clean, impute and encode the combined training and prediction data.

    Args:
        local_train (pd.DataFrame): locally stored data, or the uploaded training data
        new_data (pd.DataFrame): uploaded data to be predicted

    Returns:
        pd.DataFrame: concatenated data with missing values filled, numeric features
            log-transformed and min-max scaled, and categorical features one-hot encoded
    """
    # Concatenate both frames so imputation statistics and one-hot columns stay aligned
    all_data = pd.concat([local_train, new_data]).reset_index(drop=True)
    miss = all_data.isnull().sum().sort_values(ascending=True)
    all_cols = [x for x in all_data.columns if x != 'Id' and x != 'SalePrice']
    for col in all_cols:
        if miss[col] > 1000:
            logger.info(f"Column {col} has too many missing values, dropping it")
            all_data.drop(columns=[col], inplace=True)
    # Treat implausible garage years as missing, then fall back to the house's build year
    na_index = all_data[all_data['GarageYrBlt'] > 2022].index
    year_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
    all_data.loc[na_index, 'GarageYrBlt'] = None
    all_data.GarageYrBlt.fillna(all_data.YearBuilt, inplace=True)
    # Categorical columns where a missing value means the feature is absent
    cols1 = ["GarageQual", "GarageCond", "GarageFinish", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual",
             "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
    for col in cols1:
        all_data[col].fillna("None", inplace=True)
    # Numeric columns where a missing value means the feature is absent, so fill with 0
    cols2 = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
    for col in cols2:
        all_data[col] = all_data[col].astype(float)
        all_data[col].fillna(0, inplace=True)
    all_data["LotFrontage"].fillna(np.mean(all_data["LotFrontage"]), inplace=True)
    # Remaining categorical/discrete columns: fill with the most frequent value
    cols3 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual",
             "SaleType", "Exterior1st", "Exterior2nd"]
    for col in cols3:
        all_data[col].fillna(all_data[col].mode()[0], inplace=True)
    numeric_cols = [x for x in all_data.select_dtypes(exclude=['object']).columns.tolist() if
                    x != 'Id' and x != 'SalePrice']
    object_cols = [x for x in all_data.select_dtypes(include=['object']).columns.tolist()]
    # log1p then min-max scale numeric features; one-hot encode categorical features
    for col in numeric_cols:
        all_data[col] = np.log1p(all_data[col])
        all_data[col] = (all_data[col] - all_data[col].min()) / (all_data[col].max() - all_data[col].min())
    dataset = pd.get_dummies(all_data, columns=object_cols)
    return dataset


def build_dataset(dataset):
    # Train on log-transformed prices; rows without SalePrice are the data to predict
    dataset.SalePrice = np.log1p(dataset.SalePrice)
    train = dataset[~dataset.SalePrice.isna()].copy()
    feature_cols = [x for x in dataset.columns if x != 'Id' and x != 'SalePrice']
    for col in feature_cols:
        train[col] = train[col].astype(float)
    train, valid = train_test_split(train, test_size=0.1, shuffle=True, random_state=42)
    X_train, Y_train = train[feature_cols], train['SalePrice']
    X_valid, Y_valid = valid[feature_cols], valid['SalePrice']
    dtrain = xgb.DMatrix(X_train, Y_train)
    dvalid = xgb.DMatrix(X_valid, Y_valid)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    return dtrain, dvalid, watchlist, feature_cols


def build_model(dtrain, dvalid, watchlist, num_iter=5000, early_stop=200, **params):
    logger.info('Start training a local model')
    model = xgb.train(params, dtrain, evals=watchlist, num_boost_round=num_iter, early_stopping_rounds=early_stop,
                      verbose_eval=True)
    return model
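# A minimal sketch of hyperparameters that could be passed through **params to build_model.
# The values below are assumptions rather than settings taken from this file, although the
# bundled pretrained model name (house_price_eta0.05_round280.json) suggests eta=0.05:
#
#   params = {"eta": 0.05, "objective": "reg:squarederror", "eval_metric": "rmse"}
#   model = build_model(dtrain, dvalid, watchlist, num_iter=5000, early_stop=200, **params)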


def predict(data: pd.DataFrame, model, feature_cols):
    dtest = xgb.DMatrix(data[feature_cols])
    # Undo the log1p transform applied to SalePrice in build_dataset
    result = np.expm1(model.predict(dtest))
    data['SalePrice'] = result
    return data[['Id', 'SalePrice']].copy()


def run_boston_price(test_data, extra_train_data=None, num_iter=5000, early_stop=200, **params):
    if test_data is None:
        raise Exception("test data is None", extra_train_data)
    if extra_train_data is not None:
        # Train a new model on the uploaded training data
        if 'SalePrice' not in extra_train_data.columns:
            raise Exception("No SalePrice in train data", extra_train_data)
        extra_train_data = extra_train_data[~extra_train_data['SalePrice'].isna()].copy()
        if extra_train_data.shape[0] == 0:
            raise Exception("train data is empty", extra_train_data)
        datasets = preprocessing(extra_train_data, test_data)
        dtrain, dvalid, watchlist, feature_cols = build_dataset(datasets)
        model = build_model(dtrain, dvalid, watchlist, num_iter=num_iter, early_stop=early_stop, **params)
        dtest = datasets[datasets.SalePrice.isna()].copy()
        rst = predict(dtest, model, feature_cols)
        return rst
    else:
        # No uploaded training data: preprocess with the bundled train.csv and predict with the pretrained model
        train_data = load_data()
        datasets = preprocessing(train_data, test_data)
        _, _, _, feature_cols = build_dataset(datasets)
        model = load_model()
        dtest = datasets[datasets.SalePrice.isna()].copy()
        result = np.expm1(model.predict(dtest[feature_cols]))
        dtest['SalePrice'] = result
        return dtest[['Id', 'SalePrice']]


if __name__ == '__main__':
    pass
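    # Usage sketch (an assumption, not part of the original script): read a Kaggle-style
    # test.csv alongside the bundled train.csv and run the pretrained-model path. The
    # test.csv path below is illustrative and may not exist in this repository.
    #
    # test_df = pd.read_csv(f'{current_path}/data/test.csv')
    # predictions = run_boston_price(test_df)
    # logger.info(predictions.head())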