{ "cells": [
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
  "import xgboost as xgb\n",
  "import pandas as pd\n",
  "import numpy as np\n",
  "import matplotlib.pyplot as plt" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
  "# Load the train / test CSV files (paths are relative to the working directory).\n",
  "train_data = pd.read_csv('./data/train.csv')\n",
  "test_data = pd.read_csv('./data/test.csv')" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
  "# Remove outliers with pandas boolean-mask indexing: rows that combine a very large\n",
  "# living area (GrLivArea > 4000) with a low sale price (SalePrice < 300000).\n",
  "# The frame is rebound instead of mutated with drop(..., inplace=True).\n",
  "is_outlier = (train_data[\"GrLivArea\"] > 4000) & (train_data[\"SalePrice\"] < 300000)\n",
  "train_data = train_data.loc[~is_outlier]" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Preview the first rows only instead of rendering the whole frame.\n",
  "train_data.head()" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2917, 81)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Stack train and test so both receive the same preprocessing; the index is rebuilt\n",
  "# so that row labels are unique across the combined frame.\n",
  "all_data = pd.concat([train_data, test_data]).reset_index(drop=True)\n",
  "all_data.shape" ] },
 { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
特征名称缺失率
0PoolQC0.995885
1MiscFeature0.962963
2Alley0.937586
3Fence0.807270
4FireplaceQu0.473251
5LotFrontage0.177641
6GarageYrBlt0.055556
7GarageCond0.055556
8GarageType0.055556
9GarageFinish0.055556
10GarageQual0.055556
11BsmtFinType20.026063
12BsmtExposure0.026063
13BsmtQual0.025377
14BsmtCond0.025377
15BsmtFinType10.025377
16MasVnrArea0.005487
17MasVnrType0.005487
18Electrical0.000686
\n", "
" ], "text/plain": [ " 特征名称 缺失率\n", "0 PoolQC 0.995885\n", "1 MiscFeature 0.962963\n", "2 Alley 0.937586\n", "3 Fence 0.807270\n", "4 FireplaceQu 0.473251\n", "5 LotFrontage 0.177641\n", "6 GarageYrBlt 0.055556\n", "7 GarageCond 0.055556\n", "8 GarageType 0.055556\n", "9 GarageFinish 0.055556\n", "10 GarageQual 0.055556\n", "11 BsmtFinType2 0.026063\n", "12 BsmtExposure 0.026063\n", "13 BsmtQual 0.025377\n", "14 BsmtCond 0.025377\n", "15 BsmtFinType1 0.025377\n", "16 MasVnrArea 0.005487\n", "17 MasVnrType 0.005487\n", "18 Electrical 0.000686" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ],
 "source": [
  "# Fraction of missing values per training column, largest first; columns without\n",
  "# any missing value are left out. The two column labels are Chinese for\n",
  "# 'feature name' and 'missing rate'.\n",
  "null_counts = train_data.isnull().sum().sort_values(ascending=False)\n",
  "miss_value = (\n",
  "    null_counts[null_counts > 0]\n",
  "    .div(train_data.shape[0])\n",
  "    .rename_axis('特征名称')\n",
  "    .reset_index(name='缺失率')\n",
  ")\n",
  "miss_value" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Id 0\n", "Foundation 0\n", "Heating 0\n", "SaleCondition 0\n", "CentralAir 0\n", " ... 
\n", "SalePrice 1459\n", "Fence 2346\n", "Alley 2719\n", "MiscFeature 2812\n", "PoolQC 2908\n", "Length: 81, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ],
 "source": [
  "# Missing-value count per column of the combined frame, ascending.\n",
  "miss = all_data.isnull().sum().sort_values(ascending=True)\n",
  "miss" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Drop columns with too many missing values" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Alley\n", "FireplaceQu\n", "PoolQC\n", "Fence\n", "MiscFeature\n" ] }, { "data": { "text/plain": [ "(2917, 76)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Features with more than MAX_MISSING missing values are dropped.\n",
  "MAX_MISSING = 1000\n",
  "\n",
  "# 'Id' and 'SalePrice' are never candidates for removal.\n",
  "all_cols = [x for x in all_data.columns if x != 'Id' and x != 'SalePrice']\n",
  "\n",
  "# Count the gaps on the current frame rather than reading a Series computed in an\n",
  "# earlier cell, so this cell cannot act on stale numbers.\n",
  "missing_counts = all_data[all_cols].isnull().sum()\n",
  "sparse_cols = missing_counts[missing_counts > MAX_MISSING].index.tolist()\n",
  "for col in sparse_cols:\n",
  "    print(col)\n",
  "\n",
  "# One drop call for all columns; re-running the cell is a no-op once they are gone.\n",
  "all_data = all_data.drop(columns=sparse_cols)\n",
  "all_data.shape" ] },
 { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [
  "import seaborn as sns\n",
  "from scipy.stats import norm\n",
  "from scipy import stats" ] },
 { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/zhaojh/miniconda3/envs/py37/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "image/svg+xml": "\n\n\n \n \n \n \n 2022-07-27T14:10:04.575558\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sns.distplot(train_data.SalePrice, fit=norm)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "image/png": "", "image/svg+xml": "\n\n\n \n \n \n \n 2022-07-27T14:10:13.679452\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "rest = stats.probplot(train_data.SalePrice, plot=plt)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/zhaojh/miniconda3/envs/py37/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "image/svg+xml": "\n\n\n \n \n \n \n 2022-07-27T14:17:35.675100\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sns.distplot(np.log1p(train_data.SalePrice), fit=norm)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "image/png": "", "image/svg+xml": "\n\n\n \n \n \n \n 2022-07-27T14:17:39.713631\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "rest = stats.probplot(np.log1p(train_data.SalePrice), plot=plt)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0, 76)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_data[all_data['GarageYrBlt'].isna()].shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "na_index = all_data[all_data['GarageYrBlt'] > 2022].index\n", "all_data.loc[na_index, 'GarageYrBlt'] = None" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(160, 76)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_data[all_data['GarageYrBlt'].isna()].shape" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2917, 76)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_data.GarageYrBlt.fillna(all_data.YearBuilt, inplace=True)\n", "year_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']\n", "for col in year_cols:\n", " all_data[col] = 2022 - all_data[col]\n", "all_data.shape" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2917, 76)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols1 = [\"GarageQual\", \"GarageCond\", \"GarageFinish\", \"GarageType\", \"BsmtExposure\", \"BsmtCond\", \"BsmtQual\", \"BsmtFinType2\", \"BsmtFinType1\", \"MasVnrType\"]\n", "for col in cols1:\n", " all_data[col].fillna(\"None\",inplace=True)\n", "all_data.shape" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2917, 76)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } 
], "source": [
  "# Numeric area / count columns: a missing value is filled with 0.\n",
  "# Assigning back replaces the chained `all_data[col].fillna(0, inplace=True)`,\n",
  "# which does not update the frame under Copy-on-Write.\n",
  "cols2 = ['MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'BsmtFinSF2', 'BsmtFinSF1', 'GarageArea']\n",
  "all_data[cols2] = all_data[cols2].fillna(0)\n",
  "all_data.shape" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2917, 76)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# LotFrontage: fill gaps with the column mean.\n",
  "all_data['LotFrontage'] = all_data['LotFrontage'].fillna(all_data['LotFrontage'].mean())\n",
  "\n",
  "# Remaining sparse columns: fill gaps with the most frequent value of the column.\n",
  "cols3 = ['MSZoning', 'BsmtFullBath', 'BsmtHalfBath', 'Utilities', 'Functional',\n",
  "         'Electrical', 'KitchenQual', 'SaleType', 'Exterior1st', 'Exterior2nd']\n",
  "for col in cols3:\n",
  "    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])\n",
  "all_data.shape" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
  "# Feature columns by dtype; 'Id' and the target 'SalePrice' are excluded from scaling.\n",
  "numeric_cols = [x for x in all_data.select_dtypes(exclude=['object']).columns if x != 'Id' and x != 'SalePrice']\n",
  "object_cols = all_data.select_dtypes(include=['object']).columns.tolist()" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
  "# log1p followed by min-max scaling to [0, 1] for every numeric feature.\n",
  "# NOTE: this rewrites the columns of all_data, so run it once per freshly built frame.\n",
  "for col in numeric_cols:\n",
  "    logged = np.log1p(all_data[col])\n",
  "    lo, hi = logged.min(), logged.max()\n",
  "    # A constant column has hi == lo; map it to 0 instead of dividing by zero.\n",
  "    all_data[col] = (logged - lo) / (hi - lo) if hi > lo else 0.0" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
  "# One-hot encode the object-typed columns.\n",
  "dataset = pd.get_dummies(all_data, columns=object_cols)" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
  "# Train on log1p(SalePrice). The value is read from all_data (never transformed above,\n",
  "# same index as dataset) so re-running this cell does not apply the log twice.\n",
  "dataset['SalePrice'] = np.log1p(all_data['SalePrice'])" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1458, 280)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Rows with a known target form the training set.\n",
  "train = dataset[dataset['SalePrice'].notna()].copy()\n",
  "train.shape" ] },
 { "cell_type": "code",
"execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1459, 280)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Rows without a target are the ones to predict.\n",
  "test = dataset[dataset['SalePrice'].isna()].copy()\n",
  "test.shape" ] },
 { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
  "feature_cols = [x for x in dataset.columns if x != 'Id' and x != 'SalePrice']" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
  "from sklearn.model_selection import train_test_split" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [
  "# Hold out 12% of the labelled rows for validation. The split is taken from `dataset`\n",
  "# rather than from `train` itself, so re-running this cell does not keep shrinking\n",
  "# the training set.\n",
  "train, valid = train_test_split(dataset[dataset['SalePrice'].notna()], test_size=0.12, shuffle=True, random_state=42)" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
  "X_train, Y_train = train[feature_cols], train['SalePrice']\n",
  "X_valid, Y_valid = valid[feature_cols], valid['SalePrice']\n",
  "# Y_test holds only NaN (test rows are selected by a missing SalePrice).\n",
  "X_test, Y_test = test[feature_cols], test['SalePrice']" ] },
 { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [
  "dtrain = xgb.DMatrix(X_train, label=Y_train)\n",
  "dvalid = xgb.DMatrix(X_valid, label=Y_valid)\n",
  "# Evaluation sets reported during training, with their display names.\n",
  "watchlist = [(dtrain, 'train'), (dvalid, 'eval')]" ] },
 { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
  "# 'verbosity': 0 replaces the removed 'silent': 1 flag, which XGBoost reports\n",
  "# as an unused parameter.\n",
  "params = {\n",
  "    'objective': 'reg:squarederror',\n",
  "    'booster': 'gbtree',\n",
  "    'eta': 0.05,\n",
  "    'max_depth': 15,\n",
  "    'subsample': 0.7,\n",
  "    'colsample_bytree': 0.7,\n",
  "    'eval_metric': ['rmse'],\n",
  "    'verbosity': 0,\n",
  "    'seed': 10,\n",
  "}\n" ] },
 { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10:33:47] WARNING: ../src/learner.cc:627: \n", "Parameters: { \"silent\" } might not be used.\n", "\n", " This could be a false alarm, with some parameters getting used by language bindings but\n", " then being mistakenly passed down to XGBoost core, or 
some parameter actually being used\n", " but getting flagged wrongly here. Please open an issue if you find any such cases.\n", "\n", "\n", "[0]\ttrain-rmse:10.95491\teval-rmse:10.96235\n", "[1]\ttrain-rmse:10.40916\teval-rmse:10.41661\n", "[2]\ttrain-rmse:9.89034\teval-rmse:9.89780\n", "[3]\ttrain-rmse:9.39722\teval-rmse:9.40469\n", "[4]\ttrain-rmse:8.92885\teval-rmse:8.93633\n", "[5]\ttrain-rmse:8.48375\teval-rmse:8.49124\n", "[6]\ttrain-rmse:8.06123\teval-rmse:8.06873\n", "[7]\ttrain-rmse:7.66021\teval-rmse:7.66773\n", "[8]\ttrain-rmse:7.27851\teval-rmse:7.28504\n", "[9]\ttrain-rmse:6.91608\teval-rmse:6.92262\n", "[10]\ttrain-rmse:6.57212\teval-rmse:6.57776\n", "[11]\ttrain-rmse:6.24453\teval-rmse:6.24978\n", "[12]\ttrain-rmse:5.93355\teval-rmse:5.93791\n", "[13]\ttrain-rmse:5.63820\teval-rmse:5.64171\n", "[14]\ttrain-rmse:5.35791\teval-rmse:5.36060\n", "[15]\ttrain-rmse:5.09149\teval-rmse:5.09384\n", "[16]\ttrain-rmse:4.83796\teval-rmse:4.84034\n", "[17]\ttrain-rmse:4.59742\teval-rmse:4.59968\n", "[18]\ttrain-rmse:4.36846\teval-rmse:4.37007\n", "[19]\ttrain-rmse:4.15155\teval-rmse:4.15304\n", "[20]\ttrain-rmse:3.94554\teval-rmse:3.94632\n", "[21]\ttrain-rmse:3.74977\teval-rmse:3.74999\n", "[22]\ttrain-rmse:3.56360\teval-rmse:3.56321\n", "[23]\ttrain-rmse:3.38712\teval-rmse:3.38680\n", "[24]\ttrain-rmse:3.21874\teval-rmse:3.21847\n", "[25]\ttrain-rmse:3.05978\teval-rmse:3.05902\n", "[26]\ttrain-rmse:2.90867\teval-rmse:2.90743\n", "[27]\ttrain-rmse:2.76500\teval-rmse:2.76388\n", "[28]\ttrain-rmse:2.62812\teval-rmse:2.62685\n", "[29]\ttrain-rmse:2.49820\teval-rmse:2.49628\n", "[30]\ttrain-rmse:2.37453\teval-rmse:2.37196\n", "[31]\ttrain-rmse:2.25692\teval-rmse:2.25370\n", "[32]\ttrain-rmse:2.14536\teval-rmse:2.14147\n", "[33]\ttrain-rmse:2.03937\teval-rmse:2.03521\n", "[34]\ttrain-rmse:1.93883\teval-rmse:1.93448\n", "[35]\ttrain-rmse:1.84381\teval-rmse:1.83979\n", "[36]\ttrain-rmse:1.75285\teval-rmse:1.74887\n", "[37]\ttrain-rmse:1.66676\teval-rmse:1.66205\n", 
"[38]\ttrain-rmse:1.58492\teval-rmse:1.57965\n", "[39]\ttrain-rmse:1.50715\teval-rmse:1.50159\n", "[40]\ttrain-rmse:1.43321\teval-rmse:1.42713\n", "[41]\ttrain-rmse:1.36283\teval-rmse:1.35596\n", "[42]\ttrain-rmse:1.29620\teval-rmse:1.28879\n", "[43]\ttrain-rmse:1.23316\teval-rmse:1.22663\n", "[44]\ttrain-rmse:1.17272\teval-rmse:1.16596\n", "[45]\ttrain-rmse:1.11549\teval-rmse:1.10860\n", "[46]\ttrain-rmse:1.06120\teval-rmse:1.05444\n", "[47]\ttrain-rmse:1.00958\teval-rmse:1.00254\n", "[48]\ttrain-rmse:0.96067\teval-rmse:0.95273\n", "[49]\ttrain-rmse:0.91434\teval-rmse:0.90591\n", "[50]\ttrain-rmse:0.87015\teval-rmse:0.86133\n", "[51]\ttrain-rmse:0.82834\teval-rmse:0.81927\n", "[52]\ttrain-rmse:0.78870\teval-rmse:0.77968\n", "[53]\ttrain-rmse:0.75082\teval-rmse:0.74161\n", "[54]\ttrain-rmse:0.71492\teval-rmse:0.70547\n", "[55]\ttrain-rmse:0.68106\teval-rmse:0.67230\n", "[56]\ttrain-rmse:0.64849\teval-rmse:0.63960\n", "[57]\ttrain-rmse:0.61769\teval-rmse:0.60831\n", "[58]\ttrain-rmse:0.58868\teval-rmse:0.57939\n", "[59]\ttrain-rmse:0.56057\teval-rmse:0.55152\n", "[60]\ttrain-rmse:0.53451\teval-rmse:0.52523\n", "[61]\ttrain-rmse:0.50950\teval-rmse:0.50035\n", "[62]\ttrain-rmse:0.48564\teval-rmse:0.47651\n", "[63]\ttrain-rmse:0.46293\teval-rmse:0.45377\n", "[64]\ttrain-rmse:0.44159\teval-rmse:0.43343\n", "[65]\ttrain-rmse:0.42131\teval-rmse:0.41326\n", "[66]\ttrain-rmse:0.40179\teval-rmse:0.39410\n", "[67]\ttrain-rmse:0.38364\teval-rmse:0.37700\n", "[68]\ttrain-rmse:0.36614\teval-rmse:0.36009\n", "[69]\ttrain-rmse:0.34965\teval-rmse:0.34418\n", "[70]\ttrain-rmse:0.33389\teval-rmse:0.32955\n", "[71]\ttrain-rmse:0.31898\teval-rmse:0.31511\n", "[72]\ttrain-rmse:0.30487\teval-rmse:0.30192\n", "[73]\ttrain-rmse:0.29146\teval-rmse:0.28948\n", "[74]\ttrain-rmse:0.27854\teval-rmse:0.27745\n", "[75]\ttrain-rmse:0.26624\teval-rmse:0.26603\n", "[76]\ttrain-rmse:0.25467\teval-rmse:0.25535\n", "[77]\ttrain-rmse:0.24384\teval-rmse:0.24510\n", 
"[78]\ttrain-rmse:0.23341\teval-rmse:0.23538\n", "[79]\ttrain-rmse:0.22357\teval-rmse:0.22674\n", "[80]\ttrain-rmse:0.21429\teval-rmse:0.21868\n", "[81]\ttrain-rmse:0.20526\teval-rmse:0.21073\n", "[82]\ttrain-rmse:0.19662\teval-rmse:0.20326\n", "[83]\ttrain-rmse:0.18837\teval-rmse:0.19614\n", "[84]\ttrain-rmse:0.18054\teval-rmse:0.18948\n", "[85]\ttrain-rmse:0.17345\teval-rmse:0.18387\n", "[86]\ttrain-rmse:0.16646\teval-rmse:0.17787\n", "[87]\ttrain-rmse:0.15977\teval-rmse:0.17240\n", "[88]\ttrain-rmse:0.15350\teval-rmse:0.16762\n", "[89]\ttrain-rmse:0.14754\teval-rmse:0.16333\n", "[90]\ttrain-rmse:0.14182\teval-rmse:0.15882\n", "[91]\ttrain-rmse:0.13632\teval-rmse:0.15475\n", "[92]\ttrain-rmse:0.13127\teval-rmse:0.15126\n", "[93]\ttrain-rmse:0.12620\teval-rmse:0.14789\n", "[94]\ttrain-rmse:0.12159\teval-rmse:0.14519\n", "[95]\ttrain-rmse:0.11702\teval-rmse:0.14218\n", "[96]\ttrain-rmse:0.11266\teval-rmse:0.13953\n", "[97]\ttrain-rmse:0.10853\teval-rmse:0.13714\n", "[98]\ttrain-rmse:0.10450\teval-rmse:0.13514\n", "[99]\ttrain-rmse:0.10078\teval-rmse:0.13347\n", "[100]\ttrain-rmse:0.09716\teval-rmse:0.13144\n", "[101]\ttrain-rmse:0.09377\teval-rmse:0.12970\n", "[102]\ttrain-rmse:0.09061\teval-rmse:0.12809\n", "[103]\ttrain-rmse:0.08744\teval-rmse:0.12667\n", "[104]\ttrain-rmse:0.08450\teval-rmse:0.12523\n", "[105]\ttrain-rmse:0.08152\teval-rmse:0.12383\n", "[106]\ttrain-rmse:0.07869\teval-rmse:0.12271\n", "[107]\ttrain-rmse:0.07611\teval-rmse:0.12161\n", "[108]\ttrain-rmse:0.07358\teval-rmse:0.12084\n", "[109]\ttrain-rmse:0.07116\teval-rmse:0.11998\n", "[110]\ttrain-rmse:0.06895\teval-rmse:0.11904\n", "[111]\ttrain-rmse:0.06676\teval-rmse:0.11830\n", "[112]\ttrain-rmse:0.06457\teval-rmse:0.11761\n", "[113]\ttrain-rmse:0.06251\teval-rmse:0.11679\n", "[114]\ttrain-rmse:0.06071\teval-rmse:0.11642\n", "[115]\ttrain-rmse:0.05873\teval-rmse:0.11584\n", "[116]\ttrain-rmse:0.05691\teval-rmse:0.11509\n", "[117]\ttrain-rmse:0.05539\teval-rmse:0.11460\n", 
"[118]\ttrain-rmse:0.05374\teval-rmse:0.11408\n", "[119]\ttrain-rmse:0.05229\teval-rmse:0.11369\n", "[120]\ttrain-rmse:0.05087\teval-rmse:0.11348\n", "[121]\ttrain-rmse:0.04938\teval-rmse:0.11326\n", "[122]\ttrain-rmse:0.04790\teval-rmse:0.11283\n", "[123]\ttrain-rmse:0.04652\teval-rmse:0.11271\n", "[124]\ttrain-rmse:0.04506\teval-rmse:0.11234\n", "[125]\ttrain-rmse:0.04385\teval-rmse:0.11213\n", "[126]\ttrain-rmse:0.04264\teval-rmse:0.11208\n", "[127]\ttrain-rmse:0.04140\teval-rmse:0.11193\n", "[128]\ttrain-rmse:0.04036\teval-rmse:0.11187\n", "[129]\ttrain-rmse:0.03931\teval-rmse:0.11160\n", "[130]\ttrain-rmse:0.03824\teval-rmse:0.11150\n", "[131]\ttrain-rmse:0.03722\teval-rmse:0.11131\n", "[132]\ttrain-rmse:0.03628\teval-rmse:0.11130\n", "[133]\ttrain-rmse:0.03530\teval-rmse:0.11123\n", "[134]\ttrain-rmse:0.03441\teval-rmse:0.11112\n", "[135]\ttrain-rmse:0.03345\teval-rmse:0.11104\n", "[136]\ttrain-rmse:0.03262\teval-rmse:0.11096\n", "[137]\ttrain-rmse:0.03188\teval-rmse:0.11098\n", "[138]\ttrain-rmse:0.03105\teval-rmse:0.11097\n", "[139]\ttrain-rmse:0.03025\teval-rmse:0.11102\n", "[140]\ttrain-rmse:0.02952\teval-rmse:0.11110\n", "[141]\ttrain-rmse:0.02890\teval-rmse:0.11103\n", "[142]\ttrain-rmse:0.02824\teval-rmse:0.11104\n", "[143]\ttrain-rmse:0.02761\teval-rmse:0.11102\n", "[144]\ttrain-rmse:0.02702\teval-rmse:0.11100\n", "[145]\ttrain-rmse:0.02634\teval-rmse:0.11108\n", "[146]\ttrain-rmse:0.02584\teval-rmse:0.11106\n", "[147]\ttrain-rmse:0.02540\teval-rmse:0.11111\n", "[148]\ttrain-rmse:0.02489\teval-rmse:0.11130\n", "[149]\ttrain-rmse:0.02439\teval-rmse:0.11131\n", "[150]\ttrain-rmse:0.02382\teval-rmse:0.11130\n", "[151]\ttrain-rmse:0.02333\teval-rmse:0.11134\n", "[152]\ttrain-rmse:0.02277\teval-rmse:0.11133\n", "[153]\ttrain-rmse:0.02238\teval-rmse:0.11135\n", "[154]\ttrain-rmse:0.02189\teval-rmse:0.11143\n", "[155]\ttrain-rmse:0.02146\teval-rmse:0.11156\n", "[156]\ttrain-rmse:0.02101\teval-rmse:0.11152\n", "[157]\ttrain-rmse:0.02058\teval-rmse:0.11150\n", 
"[158]\ttrain-rmse:0.02017\teval-rmse:0.11143\n", "[159]\ttrain-rmse:0.01975\teval-rmse:0.11141\n", "[160]\ttrain-rmse:0.01932\teval-rmse:0.11136\n", "[161]\ttrain-rmse:0.01901\teval-rmse:0.11136\n", "[162]\ttrain-rmse:0.01860\teval-rmse:0.11142\n", "[163]\ttrain-rmse:0.01820\teval-rmse:0.11150\n", "[164]\ttrain-rmse:0.01792\teval-rmse:0.11156\n", "[165]\ttrain-rmse:0.01758\teval-rmse:0.11161\n", "[166]\ttrain-rmse:0.01725\teval-rmse:0.11173\n", "[167]\ttrain-rmse:0.01694\teval-rmse:0.11173\n", "[168]\ttrain-rmse:0.01661\teval-rmse:0.11172\n", "[169]\ttrain-rmse:0.01629\teval-rmse:0.11181\n", "[170]\ttrain-rmse:0.01602\teval-rmse:0.11185\n", "[171]\ttrain-rmse:0.01574\teval-rmse:0.11181\n", "[172]\ttrain-rmse:0.01544\teval-rmse:0.11183\n", "[173]\ttrain-rmse:0.01520\teval-rmse:0.11179\n", "[174]\ttrain-rmse:0.01489\teval-rmse:0.11181\n", "[175]\ttrain-rmse:0.01463\teval-rmse:0.11181\n", "[176]\ttrain-rmse:0.01435\teval-rmse:0.11179\n", "[177]\ttrain-rmse:0.01409\teval-rmse:0.11177\n", "[178]\ttrain-rmse:0.01373\teval-rmse:0.11180\n", "[179]\ttrain-rmse:0.01350\teval-rmse:0.11181\n", "[180]\ttrain-rmse:0.01327\teval-rmse:0.11180\n", "[181]\ttrain-rmse:0.01304\teval-rmse:0.11185\n", "[182]\ttrain-rmse:0.01279\teval-rmse:0.11187\n", "[183]\ttrain-rmse:0.01256\teval-rmse:0.11186\n", "[184]\ttrain-rmse:0.01232\teval-rmse:0.11188\n", "[185]\ttrain-rmse:0.01211\teval-rmse:0.11191\n", "[186]\ttrain-rmse:0.01186\teval-rmse:0.11187\n", "[187]\ttrain-rmse:0.01172\teval-rmse:0.11188\n", "[188]\ttrain-rmse:0.01150\teval-rmse:0.11201\n", "[189]\ttrain-rmse:0.01133\teval-rmse:0.11203\n", "[190]\ttrain-rmse:0.01110\teval-rmse:0.11207\n", "[191]\ttrain-rmse:0.01092\teval-rmse:0.11210\n", "[192]\ttrain-rmse:0.01075\teval-rmse:0.11209\n", "[193]\ttrain-rmse:0.01057\teval-rmse:0.11205\n", "[194]\ttrain-rmse:0.01042\teval-rmse:0.11211\n", "[195]\ttrain-rmse:0.01025\teval-rmse:0.11215\n", "[196]\ttrain-rmse:0.01008\teval-rmse:0.11213\n", "[197]\ttrain-rmse:0.00993\teval-rmse:0.11216\n", 
"[198]\ttrain-rmse:0.00973\teval-rmse:0.11215\n", "[199]\ttrain-rmse:0.00959\teval-rmse:0.11218\n", "[200]\ttrain-rmse:0.00946\teval-rmse:0.11218\n", "[201]\ttrain-rmse:0.00929\teval-rmse:0.11218\n", "[202]\ttrain-rmse:0.00911\teval-rmse:0.11218\n", "[203]\ttrain-rmse:0.00896\teval-rmse:0.11220\n", "[204]\ttrain-rmse:0.00884\teval-rmse:0.11217\n", "[205]\ttrain-rmse:0.00872\teval-rmse:0.11216\n", "[206]\ttrain-rmse:0.00861\teval-rmse:0.11219\n", "[207]\ttrain-rmse:0.00844\teval-rmse:0.11218\n", "[208]\ttrain-rmse:0.00830\teval-rmse:0.11227\n", "[209]\ttrain-rmse:0.00819\teval-rmse:0.11229\n", "[210]\ttrain-rmse:0.00809\teval-rmse:0.11230\n", "[211]\ttrain-rmse:0.00800\teval-rmse:0.11231\n", "[212]\ttrain-rmse:0.00783\teval-rmse:0.11234\n", "[213]\ttrain-rmse:0.00772\teval-rmse:0.11234\n", "[214]\ttrain-rmse:0.00762\teval-rmse:0.11232\n", "[215]\ttrain-rmse:0.00747\teval-rmse:0.11235\n", "[216]\ttrain-rmse:0.00734\teval-rmse:0.11236\n", "[217]\ttrain-rmse:0.00723\teval-rmse:0.11240\n", "[218]\ttrain-rmse:0.00709\teval-rmse:0.11241\n", "[219]\ttrain-rmse:0.00697\teval-rmse:0.11240\n", "[220]\ttrain-rmse:0.00687\teval-rmse:0.11242\n", "[221]\ttrain-rmse:0.00680\teval-rmse:0.11245\n", "[222]\ttrain-rmse:0.00667\teval-rmse:0.11250\n", "[223]\ttrain-rmse:0.00658\teval-rmse:0.11254\n", "[224]\ttrain-rmse:0.00647\teval-rmse:0.11255\n", "[225]\ttrain-rmse:0.00639\teval-rmse:0.11258\n", "[226]\ttrain-rmse:0.00627\teval-rmse:0.11257\n", "[227]\ttrain-rmse:0.00616\teval-rmse:0.11256\n", "[228]\ttrain-rmse:0.00605\teval-rmse:0.11257\n", "[229]\ttrain-rmse:0.00595\teval-rmse:0.11261\n", "[230]\ttrain-rmse:0.00583\teval-rmse:0.11262\n", "[231]\ttrain-rmse:0.00577\teval-rmse:0.11264\n", "[232]\ttrain-rmse:0.00566\teval-rmse:0.11263\n", "[233]\ttrain-rmse:0.00558\teval-rmse:0.11263\n", "[234]\ttrain-rmse:0.00552\teval-rmse:0.11264\n", "[235]\ttrain-rmse:0.00543\teval-rmse:0.11264\n", "[236]\ttrain-rmse:0.00536\teval-rmse:0.11265\n", "[237]\ttrain-rmse:0.00530\teval-rmse:0.11266\n", 
"[238]\ttrain-rmse:0.00524\teval-rmse:0.11267\n", "[239]\ttrain-rmse:0.00513\teval-rmse:0.11265\n", "[240]\ttrain-rmse:0.00505\teval-rmse:0.11265\n", "[241]\ttrain-rmse:0.00497\teval-rmse:0.11265\n", "[242]\ttrain-rmse:0.00488\teval-rmse:0.11264\n", "[243]\ttrain-rmse:0.00481\teval-rmse:0.11265\n", "[244]\ttrain-rmse:0.00472\teval-rmse:0.11266\n", "[245]\ttrain-rmse:0.00465\teval-rmse:0.11267\n", "[246]\ttrain-rmse:0.00461\teval-rmse:0.11266\n", "[247]\ttrain-rmse:0.00453\teval-rmse:0.11265\n", "[248]\ttrain-rmse:0.00445\teval-rmse:0.11265\n", "[249]\ttrain-rmse:0.00439\teval-rmse:0.11266\n", "[250]\ttrain-rmse:0.00431\teval-rmse:0.11266\n", "[251]\ttrain-rmse:0.00425\teval-rmse:0.11267\n", "[252]\ttrain-rmse:0.00417\teval-rmse:0.11268\n", "[253]\ttrain-rmse:0.00411\teval-rmse:0.11269\n", "[254]\ttrain-rmse:0.00404\teval-rmse:0.11268\n", "[255]\ttrain-rmse:0.00399\teval-rmse:0.11269\n", "[256]\ttrain-rmse:0.00391\teval-rmse:0.11270\n", "[257]\ttrain-rmse:0.00385\teval-rmse:0.11270\n", "[258]\ttrain-rmse:0.00379\teval-rmse:0.11272\n", "[259]\ttrain-rmse:0.00372\teval-rmse:0.11272\n", "[260]\ttrain-rmse:0.00367\teval-rmse:0.11271\n", "[261]\ttrain-rmse:0.00360\teval-rmse:0.11271\n", "[262]\ttrain-rmse:0.00355\teval-rmse:0.11272\n", "[263]\ttrain-rmse:0.00349\teval-rmse:0.11272\n", "[264]\ttrain-rmse:0.00342\teval-rmse:0.11273\n", "[265]\ttrain-rmse:0.00337\teval-rmse:0.11272\n", "[266]\ttrain-rmse:0.00333\teval-rmse:0.11272\n", "[267]\ttrain-rmse:0.00328\teval-rmse:0.11273\n", "[268]\ttrain-rmse:0.00324\teval-rmse:0.11274\n", "[269]\ttrain-rmse:0.00319\teval-rmse:0.11272\n", "[270]\ttrain-rmse:0.00313\teval-rmse:0.11272\n", "[271]\ttrain-rmse:0.00308\teval-rmse:0.11272\n", "[272]\ttrain-rmse:0.00303\teval-rmse:0.11273\n", "[273]\ttrain-rmse:0.00300\teval-rmse:0.11273\n", "[274]\ttrain-rmse:0.00297\teval-rmse:0.11273\n", "[275]\ttrain-rmse:0.00293\teval-rmse:0.11273\n", "[276]\ttrain-rmse:0.00288\teval-rmse:0.11273\n", "[277]\ttrain-rmse:0.00283\teval-rmse:0.11274\n", 
"[278]\ttrain-rmse:0.00278\teval-rmse:0.11273\n", "[279]\ttrain-rmse:0.00273\teval-rmse:0.11274\n", "[280]\ttrain-rmse:0.00268\teval-rmse:0.11273\n", "[281]\ttrain-rmse:0.00264\teval-rmse:0.11274\n", "[282]\ttrain-rmse:0.00259\teval-rmse:0.11273\n", "[283]\ttrain-rmse:0.00255\teval-rmse:0.11273\n", "[284]\ttrain-rmse:0.00251\teval-rmse:0.11273\n", "[285]\ttrain-rmse:0.00248\teval-rmse:0.11272\n", "[286]\ttrain-rmse:0.00243\teval-rmse:0.11272\n", "[287]\ttrain-rmse:0.00240\teval-rmse:0.11272\n", "[288]\ttrain-rmse:0.00236\teval-rmse:0.11272\n", "[289]\ttrain-rmse:0.00233\teval-rmse:0.11272\n", "[290]\ttrain-rmse:0.00230\teval-rmse:0.11272\n", "[291]\ttrain-rmse:0.00228\teval-rmse:0.11272\n", "[292]\ttrain-rmse:0.00224\teval-rmse:0.11271\n", "[293]\ttrain-rmse:0.00220\teval-rmse:0.11271\n", "[294]\ttrain-rmse:0.00217\teval-rmse:0.11271\n", "[295]\ttrain-rmse:0.00214\teval-rmse:0.11271\n", "[296]\ttrain-rmse:0.00211\teval-rmse:0.11271\n", "[297]\ttrain-rmse:0.00208\teval-rmse:0.11271\n", "[298]\ttrain-rmse:0.00205\teval-rmse:0.11270\n", "[299]\ttrain-rmse:0.00202\teval-rmse:0.11270\n", "[300]\ttrain-rmse:0.00199\teval-rmse:0.11270\n", "[301]\ttrain-rmse:0.00196\teval-rmse:0.11271\n", "[302]\ttrain-rmse:0.00192\teval-rmse:0.11271\n", "[303]\ttrain-rmse:0.00190\teval-rmse:0.11271\n", "[304]\ttrain-rmse:0.00189\teval-rmse:0.11271\n", "[305]\ttrain-rmse:0.00185\teval-rmse:0.11272\n", "[306]\ttrain-rmse:0.00182\teval-rmse:0.11272\n", "[307]\ttrain-rmse:0.00179\teval-rmse:0.11273\n", "[308]\ttrain-rmse:0.00176\teval-rmse:0.11273\n", "[309]\ttrain-rmse:0.00175\teval-rmse:0.11273\n", "[310]\ttrain-rmse:0.00173\teval-rmse:0.11273\n", "[311]\ttrain-rmse:0.00170\teval-rmse:0.11274\n", "[312]\ttrain-rmse:0.00168\teval-rmse:0.11274\n", "[313]\ttrain-rmse:0.00165\teval-rmse:0.11274\n", "[314]\ttrain-rmse:0.00163\teval-rmse:0.11274\n", "[315]\ttrain-rmse:0.00160\teval-rmse:0.11275\n", "[316]\ttrain-rmse:0.00158\teval-rmse:0.11275\n", "[317]\ttrain-rmse:0.00155\teval-rmse:0.11275\n", 
"[318]\ttrain-rmse:0.00154\teval-rmse:0.11275\n", "[319]\ttrain-rmse:0.00152\teval-rmse:0.11275\n", "[320]\ttrain-rmse:0.00150\teval-rmse:0.11275\n", "[321]\ttrain-rmse:0.00148\teval-rmse:0.11275\n", "[322]\ttrain-rmse:0.00145\teval-rmse:0.11276\n", "[323]\ttrain-rmse:0.00143\teval-rmse:0.11275\n", "[324]\ttrain-rmse:0.00141\teval-rmse:0.11275\n", "[325]\ttrain-rmse:0.00138\teval-rmse:0.11275\n", "[326]\ttrain-rmse:0.00136\teval-rmse:0.11276\n", "[327]\ttrain-rmse:0.00134\teval-rmse:0.11275\n", "[328]\ttrain-rmse:0.00132\teval-rmse:0.11276\n", "[329]\ttrain-rmse:0.00130\teval-rmse:0.11276\n", "[330]\ttrain-rmse:0.00128\teval-rmse:0.11276\n", "[331]\ttrain-rmse:0.00127\teval-rmse:0.11275\n", "[332]\ttrain-rmse:0.00125\teval-rmse:0.11275\n", "[333]\ttrain-rmse:0.00123\teval-rmse:0.11275\n", "[334]\ttrain-rmse:0.00121\teval-rmse:0.11275\n", "[335]\ttrain-rmse:0.00119\teval-rmse:0.11276\n" ] } ], "source": [ "gbm = xgb.train(params, dtrain, evals=watchlist, num_boost_round=5000,\n", " early_stopping_rounds=200, verbose_eval=True)" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [
 "# xgb.train returns the booster from the LAST boosting round, not the best one,\n",
 "# so restrict prediction to the early-stopped best iteration (xgboost >= 1.4).\n",
 "x_pred = gbm.predict(xgb.DMatrix(X_test),\n",
 "                     iteration_range=(0, gbm.best_iteration + 1))" ] },
{ "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [
 "# Back-transform predictions with expm1.\n",
 "# NOTE(review): presumably the target was trained as log1p(SalePrice) - confirm.\n",
 "test['SalePrice'] = np.expm1(x_pred)" ] },
{ "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [
 "# Write the submission file with only the Id and predicted SalePrice columns.\n",
 "test[['Id', 'SalePrice']].to_csv('house_pred2.csv', index=False, encoding='utf-8')" ] },
{ "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [
 "# NOTE(review): this persists every boosted round, not just the best iteration,\n",
 "# and the 'round280' in the file name may not match the current run - confirm.\n",
 "gbm.save_model('./pretrain_models/house_price_eta0.05_round280.json')" ] },
{ "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [
 "# Empty scikit-learn wrapper that will receive the saved booster.\n",
 "gg = xgb.XGBRegressor()" ] },
{ "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
"/home/zhaojh/miniconda3/envs/py37/lib/python3.7/site-packages/xgboost/sklearn.py:742: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n", " 'Loading a native XGBoost model with Scikit-Learn interface.'\n" ] } ], "source": [
 "# Reload the saved model through the scikit-learn wrapper; xgboost emits a\n",
 "# UserWarning because the file was written by the native (xgb.train) API.\n",
 "gg.load_model('./pretrain_models/house_price_eta0.05_round280.json')" ] },
{ "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [
 "# Same expm1 back-transform, but scored with the reloaded model.\n",
 "# NOTE(review): the reloaded model may use all saved rounds - verify it agrees with x_pred.\n",
 "test['SalePrice'] = np.expm1(gg.predict(X_test))" ] },
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([11.706002, 12.04607 , 12.116972, ..., 11.978775, 11.649101,\n", " 12.330935], dtype=float32)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_pred" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.7.13 ('py37')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "993bd31d5df1020fab369d79a34ff0a2a159e1798f3e25d3ad4b7751d38184c9" } } }, "nbformat": 4, "nbformat_minor": 2 }