# -*-coding:utf-8-*-
"""House-price regression pipeline built on XGBoost.

Cleans and one-hot encodes the Kaggle-style house-price data, then either
trains a fresh model on uploaded labelled rows or scores new rows with the
pretrained model stored next to this file.
"""
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from logzero import logger
import os

# Directory of this file; data/ and pretrain_models/ are resolved relative to it.
current_path = os.path.dirname(__file__)


def load_data():
    """Read the bundled training CSV and drop known outlier rows.

    Returns:
        pd.DataFrame: training data with the classic Ames outliers
        (very large living area but low sale price) removed.
    """
    logger.info(f"读取本地数据")
    logger.info(current_path)
    train_data = pd.read_csv(f'{current_path}/data/train.csv')
    # Conditional index drop: rows that are implausibly large yet cheap.
    outlier_index = train_data[(train_data["GrLivArea"] > 4000)
                               & (train_data["SalePrice"] < 300000)].index
    train_data.drop(outlier_index, inplace=True)
    return train_data


def load_model():
    """Load the pretrained XGBoost model shipped with the project."""
    logger.info(f"读取本地模型")
    model = xgb.XGBModel()
    model.load_model(f'{current_path}/pretrain_models/house_price_eta0.05_round280.json')
    return model


def preprocessing(local_train: pd.DataFrame, new_data: pd.DataFrame) -> pd.DataFrame:
    """Clean, impute and one-hot encode training and prediction rows together.

    Args:
        local_train: stored (or uploaded) training data containing SalePrice.
        new_data: uploaded rows to predict; their SalePrice is absent/NaN.

    Returns:
        pd.DataFrame: encoded frame; prediction rows are identifiable as the
        ones whose SalePrice is NaN.
    """
    all_data = pd.concat([local_train, new_data]).reset_index(drop=True)
    miss = all_data.isnull().sum().sort_values(ascending=True)
    all_cols = [x for x in all_data.columns if x not in ('Id', 'SalePrice')]
    for col in all_cols:
        if miss[col] > 1000:
            logger.info(f"{col}列缺失比例过高,删除")
            all_data.drop(columns=[col], inplace=True)
    # Garage years in the future are data-entry errors: null them out, then
    # backfill every missing garage year with the house build year.
    bad_year_index = all_data[all_data['GarageYrBlt'] > 2022].index
    all_data.loc[bad_year_index, 'GarageYrBlt'] = None
    # NOTE: assignment form instead of chained fillna(inplace=True), which is
    # deprecated and a silent no-op under pandas copy-on-write.
    all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(all_data['YearBuilt'])
    # Categorical columns where NaN means "feature absent".
    none_cols = ["GarageQual", "GarageCond", "GarageFinish", "GarageType",
                 "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2",
                 "BsmtFinType1", "MasVnrType"]
    for col in none_cols:
        all_data[col] = all_data[col].fillna("None")
    # Numeric columns where NaN means zero (no basement / no garage, etc.).
    zero_cols = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars",
                 "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
    for col in zero_cols:
        all_data[col] = all_data[col].astype(float).fillna(0)
    all_data["LotFrontage"] = all_data["LotFrontage"].fillna(all_data["LotFrontage"].mean())
    # Remaining sparse gaps take the column mode.
    mode_cols = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities",
                 "Functional", "Electrical", "KitchenQual", "SaleType",
                 "Exterior1st", "Exterior2nd"]
    for col in mode_cols:
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    numeric_cols = [x for x in all_data.select_dtypes(exclude=['object']).columns.tolist()
                    if x not in ('Id', 'SalePrice')]
    object_cols = all_data.select_dtypes(include=['object']).columns.tolist()
    for col in numeric_cols:
        # log1p to tame skew, then min-max scale into [0, 1].
        all_data[col] = np.log1p(all_data[col])
        col_min, col_max = all_data[col].min(), all_data[col].max()
        all_data[col] = (all_data[col] - col_min) / (col_max - col_min)
    dataset = pd.get_dummies(all_data, columns=object_cols)
    return dataset


def build_dataset(dataset):
    """Split the encoded frame into XGBoost train/validation matrices.

    Mutates `dataset` in place: SalePrice is replaced by log1p(SalePrice),
    so downstream predictions must be inverted with expm1.

    Returns:
        (dtrain, dvalid, watchlist, feature_cols)
    """
    dataset['SalePrice'] = np.log1p(dataset['SalePrice'])
    train = dataset[~dataset['SalePrice'].isna()].copy()
    feature_cols = [x for x in dataset.columns if x not in ('Id', 'SalePrice')]
    # get_dummies may emit bool columns; DMatrix needs numeric input.
    for col in feature_cols:
        train[col] = train[col].astype(float)
    train, valid = train_test_split(train, test_size=0.1, shuffle=True, random_state=42)
    dtrain = xgb.DMatrix(train[feature_cols], train['SalePrice'])
    dvalid = xgb.DMatrix(valid[feature_cols], valid['SalePrice'])
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    return dtrain, dvalid, watchlist, feature_cols


def build_model(dtrain, dvalid, watchlist, num_iter=5000, early_stop=200, **params):
    """Train a Booster with early stopping on the validation set."""
    logger.info('开始本地建模')
    model = xgb.train(params, dtrain, evals=watchlist,
                      num_boost_round=num_iter,
                      early_stopping_rounds=early_stop,
                      verbose_eval=True)
    return model


def predict(data: pd.DataFrame, model, feature_cols):
    """Score rows with a trained Booster and return Id + predicted SalePrice."""
    # Cast to float: dummy columns may be bool, which DMatrix rejects.
    dtest = xgb.DMatrix(data[feature_cols].astype(float))
    # Invert the log1p applied to SalePrice during training.
    data['SalePrice'] = np.expm1(model.predict(dtest))
    return data[['Id', 'SalePrice']].copy()


def run_boston_price(test_data, extra_train_data=None, num_iter=5000, early_stop=200, **params):
    """Entry point: train-and-predict, or predict with the pretrained model.

    Args:
        test_data: rows to score (must not be None).
        extra_train_data: optional labelled data. When supplied, a fresh
            model is trained on it; otherwise the bundled data and the
            pretrained model are used.
        num_iter: max boosting rounds for a fresh model.
        early_stop: early-stopping patience for a fresh model.
        **params: forwarded to xgb.train.

    Returns:
        pd.DataFrame with columns ['Id', 'SalePrice'].

    Raises:
        Exception: when test data is missing, or uploaded training data
        lacks usable SalePrice labels.
    """
    if test_data is None:
        raise Exception("test data is None", extra_train_data)
    if extra_train_data is not None:
        # Train a fresh model on the uploaded labelled data.
        if 'SalePrice' not in extra_train_data.columns:
            raise Exception("No SalePrice in train data", extra_train_data)
        extra_train_data = extra_train_data[~extra_train_data['SalePrice'].isna()].copy()
        if extra_train_data.shape[0] == 0:
            raise Exception("train data is None", extra_train_data)
        datasets = preprocessing(extra_train_data, test_data)
        dtrain, dvalid, watchlist, feature_cols = build_dataset(datasets)
        model = build_model(dtrain, dvalid, watchlist,
                            num_iter=num_iter, early_stop=early_stop, **params)
        dtest = datasets[datasets['SalePrice'].isna()].copy()
        return predict(dtest, model, feature_cols)
    # Pretrained path: re-run preprocessing with the bundled data so the
    # dummy-encoded feature space lines up with training time.
    # NOTE(review): this assumes the bundled CSV reproduces the exact
    # column set the saved model was trained on — confirm against the
    # model's stored feature names.
    train_data = load_data()
    datasets = preprocessing(train_data, test_data)
    _, _, _, feature_cols = build_dataset(datasets)
    model = load_model()
    dtest = datasets[datasets['SalePrice'].isna()].copy()
    dtest['SalePrice'] = np.expm1(model.predict(dtest[feature_cols].astype(float)))
    return dtest[['Id', 'SalePrice']]


if __name__ == '__main__':
    pass