XGBoost (eXtreme Gradient Boosting) is a widely used model framework for classification and regression tasks, and house price prediction can be treated as a regression task. This chapter builds an XGBoost regression model step by step, starting from the data-processing stage, and offers an approach to parameter tuning. Any other regression task over multi-dimensional features can be modeled in the same way.
# Install the required packages; skip this step if they are already available locally
# ! pip install -U xgboost
# ! pip install -U pandas
# ! pip install -U numpy
# ! pip install -U scikit-learn matplotlib seaborn scipy
# Import the required modules
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from scipy.stats import norm
from scipy import stats
Read in the data
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
Correlation analysis
corrmat = train_data.corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(corrmat, vmax=.8, square=True)
Select the features most correlated with the target
k = 10  # number of features most correlated with SalePrice to examine
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train_data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.show()
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(train_data[cols], height=2.5)  # seaborn renamed `size` to `height`
plt.show()
Looking at the scatter of GrLivArea against SalePrice, two clear outliers stand out (very large living area but unusually low price), as the focused scatter below shows; by domain judgment these two samples can be deleted. The same reasoning lets us remove the samples behind the other outlier rules in the code that follows.
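This focused scatter is a small sketch added for illustration (it is not an output of the original notebook); the pairplot above already contains the same panel:

```python
# Focused look at GrLivArea vs SalePrice before removing outliers:
# the two points with GrLivArea > 4000 but SalePrice < 200000 sit far
# below the overall trend.
plt.scatter(train_data.GrLivArea, train_data.SalePrice, s=12)
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.show()
```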
train_data.drop(train_data[(train_data.GrLivArea>4000)&(train_data.SalePrice<200000)].index,inplace=True)
train_data.drop(train_data[(train_data.OverallQual<5)&(train_data.SalePrice>200000)].index,inplace=True)
train_data.drop(train_data[(train_data.YearBuilt<1900)&(train_data.SalePrice>400000)].index,inplace=True)
train_data.drop(train_data[(train_data.YearBuilt>1980)&(train_data.SalePrice>700000)].index,inplace=True)
train_data.drop(train_data[(train_data.TotalBsmtSF>6000)&(train_data.SalePrice<200000)].index,inplace=True)
train_data.reset_index(drop=True,inplace=True)
train_data.shape
(1454, 81)
Missing-value analysis
# Concatenate the training and test sets so that later preprocessing is applied
# to both consistently; the missing-value statistics below are computed on the
# training set.
all_data = pd.concat([train_data, test_data]).reset_index(drop=True)
miss_value = train_data.isnull().sum().sort_values(ascending=False).to_frame().reset_index()
miss_value.columns = ['feature', 'miss_count']
miss_value = miss_value[miss_value.miss_count > 0]
miss_value['miss_per'] = miss_value.miss_count / train_data.shape[0]
miss_value
| | feature | miss_count | miss_per |
|---|---|---|---|
| 0 | PoolQC | 1449 | 0.996561 |
| 1 | MiscFeature | 1400 | 0.962861 |
| 2 | Alley | 1363 | 0.937414 |
| 3 | Fence | 1175 | 0.808116 |
| 4 | FireplaceQu | 690 | 0.474553 |
| 5 | LotFrontage | 258 | 0.177442 |
| 6 | GarageYrBlt | 81 | 0.055708 |
| 7 | GarageCond | 81 | 0.055708 |
| 8 | GarageType | 81 | 0.055708 |
| 9 | GarageFinish | 81 | 0.055708 |
| 10 | GarageQual | 81 | 0.055708 |
| 11 | BsmtFinType2 | 38 | 0.026135 |
| 12 | BsmtExposure | 38 | 0.026135 |
| 13 | BsmtQual | 37 | 0.025447 |
| 14 | BsmtCond | 37 | 0.025447 |
| 15 | BsmtFinType1 | 37 | 0.025447 |
| 16 | MasVnrArea | 8 | 0.005502 |
| 17 | MasVnrType | 8 | 0.005502 |
| 18 | Electrical | 1 | 0.000688 |
all_cols = [x for x in all_data.columns if x != 'Id' and x != 'SalePrice']
drop_cols = miss_value[miss_value.miss_per > 0.15]['feature'].values.tolist()  # drop features missing in more than 15% of rows
for col in all_cols:
    if col in drop_cols:
        print(f"dropping column {col}")
        all_data.drop(columns=[col], inplace=True)
all_data.shape
dropping column LotFrontage
dropping column Alley
dropping column FireplaceQu
dropping column PoolQC
dropping column Fence
dropping column MiscFeature
(2913, 75)
With the help of scipy's norm module, we plot the distribution and the Q-Q plot of the sale price column.
sns.distplot(train_data.SalePrice, fit=norm)
/home/zhaojh/miniconda3/envs/py37/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
rest = stats.probplot(train_data.SalePrice, plot=plt)
The distribution is clearly skewed, while many machine learning methods are derived under the assumption of (or behave better with) roughly Gaussian data. So we first bring the price closer to a Gaussian shape: apply numpy's log1p transform and plot the distribution again.
sns.distplot(np.log1p(train_data.SalePrice), fit=norm)
rest = stats.probplot(np.log1p(train_data.SalePrice), plot=plt)
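Incidentally, the FutureWarning above notes that distplot is deprecated. On seaborn 0.11+ a roughly equivalent plot is a density histogram plus an explicitly fitted normal curve; a minimal sketch:

```python
# Deprecation-free equivalent of distplot(..., fit=norm): density histogram
# via histplot, with the normal curve fitted explicitly by scipy.
sns.histplot(train_data.SalePrice, stat='density')
mu, sigma = norm.fit(train_data.SalePrice)
xs = np.linspace(train_data.SalePrice.min(), train_data.SalePrice.max(), 200)
plt.plot(xs, norm.pdf(xs, mu, sigma))
plt.show()
```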
Next we fill the missing values and one-hot encode the discrete (categorical) variables.
# A data-entry error leaves some GarageYrBlt values in the future; treat them
# as missing and fall back to the year the house itself was built.
na_index = all_data[all_data['GarageYrBlt'] > 2022].index
all_data.loc[na_index, 'GarageYrBlt'] = None
all_data.GarageYrBlt.fillna(all_data.YearBuilt, inplace=True)
cols1 = ["GarageQual", "GarageCond", "GarageFinish", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols1:
    # Fill missing entries with an explicit "None" category; other imputation strategies would also work here.
    all_data[col].fillna("None", inplace=True)
cols2 = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols2:
    # Make sure these area/count features are numeric, then treat missing as 0.
    all_data[col] = all_data[col].astype(float)
    all_data[col].fillna(0, inplace=True)
cols3 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType", "Exterior1st", "Exterior2nd"]
for col in cols3:
    # Fill missing entries with the feature's mode.
    all_data[col].fillna(all_data[col].mode()[0], inplace=True)
numeric_cols = [x for x in all_data.select_dtypes(exclude=['object']).columns.tolist() if x != 'Id' and x != 'SalePrice']
object_cols = [x for x in all_data.select_dtypes(include=['object']).columns.tolist()]
for col in numeric_cols:
    # Log-transform the numeric features, then min-max scale them to [0, 1].
    all_data[col] = np.log1p(all_data[col])
    all_data[col] = (all_data[col] - all_data[col].min()) / (all_data[col].max() - all_data[col].min())
dataset = pd.get_dummies(all_data, columns=object_cols)  # one-hot encode the categorical columns
With preprocessing done, we can build and train the model.
# First, log-transform the sale price
dataset.SalePrice = np.log1p(dataset.SalePrice)
# dataset is the concatenation of the original train.csv and test.csv; split it
# back apart (the test rows are the ones with no SalePrice)
train = dataset[~dataset.SalePrice.isna()].copy()
test = dataset[dataset.SalePrice.isna()].copy()
# The Id and SalePrice columns must not be used as training features.
feature_cols = [x for x in dataset.columns if x != 'Id' and x != 'SalePrice']
train.shape, test.shape
((1454, 279), (1459, 279))
# Split off a validation set
train, valid = train_test_split(train, test_size=0.15, shuffle=True, random_state=42)
X_train, Y_train = train[feature_cols], train['SalePrice']
X_valid, Y_valid = valid[feature_cols], valid['SalePrice']
X_test, Y_test = test[feature_cols], test['SalePrice']  # Y_test is all NaN: the test set is unlabeled
# Convert to xgboost's DMatrix format
dtrain = xgb.DMatrix(X_train, Y_train)
dvalid = xgb.DMatrix(X_valid, Y_valid)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# Hyperparameters, tuned by experience or with a search tool.
params = {'objective': 'reg:squarederror',
          'booster': 'gbtree',
          'eta': 0.05,
          'max_depth': 15,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'eval_metric': ['mae'],
          'silent': 1,  # ignored by recent xgboost versions, hence the warning below
          'seed': 10}
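For a more systematic search than hand tuning, xgboost's built-in cross-validation can score candidate settings before the final fit. A minimal sketch (the candidate values below are illustrative assumptions, not tuned results):

```python
# Compare a few candidate (eta, max_depth) pairs by 5-fold cross-validated MAE
# and keep the best; extend the grids, or swap in a dedicated tuning tool,
# for a real search.
best_score, best_params = float('inf'), None
for eta in (0.03, 0.05, 0.1):
    for max_depth in (4, 8, 15):
        cand = dict(params, eta=eta, max_depth=max_depth)
        cv = xgb.cv(cand, dtrain, num_boost_round=2000, nfold=5,
                    metrics='mae', early_stopping_rounds=100, seed=10)
        score = cv['test-mae-mean'].min()
        if score < best_score:
            best_score, best_params = score, cand
print(best_params, best_score)
```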
# Train with early stopping on the validation set
gbm = xgb.train(params, dtrain, evals=watchlist, num_boost_round=5000,
                early_stopping_rounds=200, verbose_eval=True)
[10:15:05] WARNING: ../src/learner.cc:627: Parameters: { "silent" } might not be used. This could be a false alarm, with some parameters getting used by language bindings but then being mistakenly passed down to XGBoost core, or some parameter actually being used but getting flagged wrongly here. Please open an issue if you find any such cases.
[0]    train-mae:10.94380    eval-mae:10.95373
[1]    train-mae:10.39788    eval-mae:10.40781
[2]    train-mae:9.87831     eval-mae:9.88825
...
[562]  train-mae:0.00036     eval-mae:0.09290
[563]  train-mae:0.00036     eval-mae:0.09290
[564]  train-mae:0.00036     eval-mae:0.09290
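Before moving to the test set, the metrics imported at the top (mean_absolute_error, r2_score) give a quick sanity check on the held-out validation split; a short sketch:

```python
# Evaluate the trained booster on the validation split (in log-price space).
# Recent xgboost versions predict with the best early-stopped iteration by default.
valid_pred = gbm.predict(dvalid)
print('validation MAE:', mean_absolute_error(Y_valid, valid_pred))
print('validation R^2:', r2_score(Y_valid, valid_pred))
```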
With training finished, we can predict on the test set. Since the test set was preprocessed together with the training data earlier, no further processing is needed here; when the model serves live data, the raw inputs must first go through the same preprocessing before prediction (see the scoring sketch at the end of this section).
x_pred = gbm.predict(xgb.DMatrix(X_test))
Since SalePrice was log-transformed earlier, we now invert that transform with the exponential to recover actual prices.
pred_price = np.expm1(x_pred)
test['SalePrice'] = pred_price
test[['Id', 'SalePrice']]
| | Id | SalePrice |
|---|---|---|
| 1454 | 1461 | 125133.742188 |
| 1455 | 1462 | 166289.421875 |
| 1456 | 1463 | 187760.562500 |
| 1457 | 1464 | 195140.906250 |
| 1458 | 1465 | 188690.578125 |
| ... | ... | ... |
| 2908 | 2915 | 82035.882812 |
| 2909 | 2916 | 85035.984375 |
| 2910 | 2917 | 163848.312500 |
| 2911 | 2918 | 113641.289062 |
| 2912 | 2919 | 224657.375000 |

1459 rows × 2 columns
Finally, save the model as JSON.
gbm.save_model('./model/model.json')
Once the model is saved, it can be loaded straight from the file next time, with no retraining.
local_model = xgb.Booster(model_file='./model/model.json')
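As noted above, live data must be run through the same preprocessing before calling predict. A minimal scoring sketch (predict_live is a hypothetical helper, not part of the original notebook; it assumes object_cols and feature_cols are persisted alongside the model, and that live_df has already received the same fills and log/min-max scaling as all_data):

```python
def predict_live(live_df: pd.DataFrame) -> np.ndarray:
    # One-hot encode, then align the columns to the training feature set:
    # categories unseen at train time are dropped, missing dummies become 0.
    encoded = pd.get_dummies(live_df, columns=object_cols)
    aligned = encoded.reindex(columns=feature_cols, fill_value=0)
    # The model predicts log1p(price); invert with expm1.
    return np.expm1(local_model.predict(xgb.DMatrix(aligned)))
```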