add code
|
@ -0,0 +1,601 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import catboost\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from catboost import CatBoostRegressor"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>PM2.5</th>\n",
|
||||
" <th>PM10</th>\n",
|
||||
" <th>SO2</th>\n",
|
||||
" <th>NO2</th>\n",
|
||||
" <th>O3</th>\n",
|
||||
" <th>O3_8h</th>\n",
|
||||
" <th>CO</th>\n",
|
||||
" <th>wd</th>\n",
|
||||
" <th>ws</th>\n",
|
||||
" <th>air_temp</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>PM2.5_transportation</th>\n",
|
||||
" <th>PM2.5_resdient</th>\n",
|
||||
" <th>PM2.5_power</th>\n",
|
||||
" <th>pre_PM2.5</th>\n",
|
||||
" <th>pre_PM10</th>\n",
|
||||
" <th>pre_SO2</th>\n",
|
||||
" <th>pre_NO2</th>\n",
|
||||
" <th>pre_O3</th>\n",
|
||||
" <th>pre_O3_8h</th>\n",
|
||||
" <th>pre_CO</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>4.744932</td>\n",
|
||||
" <td>5.176150</td>\n",
|
||||
" <td>5.723585</td>\n",
|
||||
" <td>3.663562</td>\n",
|
||||
" <td>2.197225</td>\n",
|
||||
" <td>2.302585</td>\n",
|
||||
" <td>1.515127</td>\n",
|
||||
" <td>58.0</td>\n",
|
||||
" <td>0.7</td>\n",
|
||||
" <td>-11.1</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.081248</td>\n",
|
||||
" <td>0.827110</td>\n",
|
||||
" <td>0.418587</td>\n",
|
||||
" <td>136.0</td>\n",
|
||||
" <td>214.0</td>\n",
|
||||
" <td>317.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>9.0</td>\n",
|
||||
" <td>3.71</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>4.584967</td>\n",
|
||||
" <td>5.043425</td>\n",
|
||||
" <td>5.726848</td>\n",
|
||||
" <td>3.637586</td>\n",
|
||||
" <td>2.079442</td>\n",
|
||||
" <td>2.197225</td>\n",
|
||||
" <td>1.506297</td>\n",
|
||||
" <td>185.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>-11.7</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.088313</td>\n",
|
||||
" <td>0.827110</td>\n",
|
||||
" <td>0.412773</td>\n",
|
||||
" <td>114.0</td>\n",
|
||||
" <td>176.0</td>\n",
|
||||
" <td>305.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>9.0</td>\n",
|
||||
" <td>3.55</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>4.477337</td>\n",
|
||||
" <td>4.955827</td>\n",
|
||||
" <td>5.758902</td>\n",
|
||||
" <td>3.663562</td>\n",
|
||||
" <td>2.079442</td>\n",
|
||||
" <td>2.197225</td>\n",
|
||||
" <td>1.515127</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.2</td>\n",
|
||||
" <td>-12.7</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.091256</td>\n",
|
||||
" <td>0.827110</td>\n",
|
||||
" <td>0.424400</td>\n",
|
||||
" <td>97.0</td>\n",
|
||||
" <td>154.0</td>\n",
|
||||
" <td>306.0</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>3.51</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4.454347</td>\n",
|
||||
" <td>4.941642</td>\n",
|
||||
" <td>5.680173</td>\n",
|
||||
" <td>3.637586</td>\n",
|
||||
" <td>2.079442</td>\n",
|
||||
" <td>2.197225</td>\n",
|
||||
" <td>1.530395</td>\n",
|
||||
" <td>199.0</td>\n",
|
||||
" <td>1.4</td>\n",
|
||||
" <td>-10.9</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.092434</td>\n",
|
||||
" <td>1.746121</td>\n",
|
||||
" <td>0.459282</td>\n",
|
||||
" <td>87.0</td>\n",
|
||||
" <td>141.0</td>\n",
|
||||
" <td>316.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>3.55</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>4.672829</td>\n",
|
||||
" <td>5.123964</td>\n",
|
||||
" <td>5.758902</td>\n",
|
||||
" <td>3.637586</td>\n",
|
||||
" <td>2.197225</td>\n",
|
||||
" <td>2.197225</td>\n",
|
||||
" <td>1.605430</td>\n",
|
||||
" <td>359.0</td>\n",
|
||||
" <td>1.2</td>\n",
|
||||
" <td>-12.3</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.170738</td>\n",
|
||||
" <td>3.446292</td>\n",
|
||||
" <td>0.514513</td>\n",
|
||||
" <td>85.0</td>\n",
|
||||
" <td>139.0</td>\n",
|
||||
" <td>292.0</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>3.62</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 49 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" PM2.5 PM10 SO2 NO2 O3 O3_8h CO \\\n",
|
||||
"0 4.744932 5.176150 5.723585 3.663562 2.197225 2.302585 1.515127 \n",
|
||||
"1 4.584967 5.043425 5.726848 3.637586 2.079442 2.197225 1.506297 \n",
|
||||
"2 4.477337 4.955827 5.758902 3.663562 2.079442 2.197225 1.515127 \n",
|
||||
"3 4.454347 4.941642 5.680173 3.637586 2.079442 2.197225 1.530395 \n",
|
||||
"4 4.672829 5.123964 5.758902 3.637586 2.197225 2.197225 1.605430 \n",
|
||||
"\n",
|
||||
" wd ws air_temp ... PM2.5_transportation PM2.5_resdient \\\n",
|
||||
"0 58.0 0.7 -11.1 ... 0.081248 0.827110 \n",
|
||||
"1 185.0 0.5 -11.7 ... 0.088313 0.827110 \n",
|
||||
"2 0.0 0.2 -12.7 ... 0.091256 0.827110 \n",
|
||||
"3 199.0 1.4 -10.9 ... 0.092434 1.746121 \n",
|
||||
"4 359.0 1.2 -12.3 ... 0.170738 3.446292 \n",
|
||||
"\n",
|
||||
" PM2.5_power pre_PM2.5 pre_PM10 pre_SO2 pre_NO2 pre_O3 pre_O3_8h \\\n",
|
||||
"0 0.418587 136.0 214.0 317.0 38.0 8.0 9.0 \n",
|
||||
"1 0.412773 114.0 176.0 305.0 38.0 8.0 9.0 \n",
|
||||
"2 0.424400 97.0 154.0 306.0 37.0 7.0 8.0 \n",
|
||||
"3 0.459282 87.0 141.0 316.0 38.0 7.0 8.0 \n",
|
||||
"4 0.514513 85.0 139.0 292.0 37.0 7.0 8.0 \n",
|
||||
"\n",
|
||||
" pre_CO \n",
|
||||
"0 3.71 \n",
|
||||
"1 3.55 \n",
|
||||
"2 3.51 \n",
|
||||
"3 3.55 \n",
|
||||
"4 3.62 \n",
|
||||
"\n",
|
||||
"[5 rows x 49 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data = pd.read_csv('./data/train_data_mod.csv')\n",
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'O3_8h', 'CO'], dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"feature_cols = data.columns[7:]\n",
|
||||
"out_cols = data.columns[:7]\n",
|
||||
"out_cols"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"train_X, test_X, train_y, test_y = train_test_split(data[feature_cols], data[out_cols], test_size=0.2,\n",
|
||||
" random_state=42)\n",
|
||||
"#准备参数\n",
|
||||
"other_params = {'learning_rate': 0.01, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,\n",
|
||||
" 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}\n",
|
||||
"\n",
|
||||
"params_gbm = {\n",
|
||||
" 'task': 'train',\n",
|
||||
" 'boosting_type': 'gbdt', # 设置提升类型\n",
|
||||
" 'objective': 'l1', # 目标函数\n",
|
||||
" 'metric': 'rmse', # 评估函数\n",
|
||||
" 'max_depth': 10,\n",
|
||||
" 'num_leaves': 20, # 叶子节点数\n",
|
||||
" 'learning_rate': 0.09, # 学习速率\n",
|
||||
" 'feature_fraction': 0.9, # 建树的特征选择比例\n",
|
||||
" 'bagging_fraction': 0.9, # 建树的样本采样比例\n",
|
||||
" 'bagging_freq': 10, # k 意味着每 k 次迭代执行bagging\n",
|
||||
" 'verbose': -1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import lightgbm as lgb\n",
|
||||
"from sklearn.multioutput import MultiOutputRegressor\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"base_model = lgb.LGBMRegressor(**params_gbm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"base_cat = CatBoostRegressor(iterations=1000, learning_rate=0.0005, depth=10, loss_function='RMSE', eval_metric='RMSE', random_seed=99, od_type='Iter', od_wait=50, verbose=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"multioutputregressor = MultiOutputRegressor(base_cat).fit(train_X, train_y)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"rst = multioutputregressor.predict(test_X)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>PM2.5</th>\n",
|
||||
" <th>PM10</th>\n",
|
||||
" <th>SO2</th>\n",
|
||||
" <th>NO2</th>\n",
|
||||
" <th>O3</th>\n",
|
||||
" <th>O3_8h</th>\n",
|
||||
" <th>CO</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>3.653185</td>\n",
|
||||
" <td>4.700200</td>\n",
|
||||
" <td>2.722381</td>\n",
|
||||
" <td>3.261589</td>\n",
|
||||
" <td>3.836444</td>\n",
|
||||
" <td>3.857181</td>\n",
|
||||
" <td>0.615219</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>4.323887</td>\n",
|
||||
" <td>4.923196</td>\n",
|
||||
" <td>3.198502</td>\n",
|
||||
" <td>4.016752</td>\n",
|
||||
" <td>3.166474</td>\n",
|
||||
" <td>3.591639</td>\n",
|
||||
" <td>0.824886</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3.660165</td>\n",
|
||||
" <td>4.662362</td>\n",
|
||||
" <td>3.136948</td>\n",
|
||||
" <td>3.513742</td>\n",
|
||||
" <td>3.763910</td>\n",
|
||||
" <td>3.671770</td>\n",
|
||||
" <td>0.631061</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>3.728112</td>\n",
|
||||
" <td>4.645958</td>\n",
|
||||
" <td>3.514411</td>\n",
|
||||
" <td>3.718547</td>\n",
|
||||
" <td>3.199907</td>\n",
|
||||
" <td>3.291750</td>\n",
|
||||
" <td>0.862777</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>4.189668</td>\n",
|
||||
" <td>4.743439</td>\n",
|
||||
" <td>3.445615</td>\n",
|
||||
" <td>3.674801</td>\n",
|
||||
" <td>3.949052</td>\n",
|
||||
" <td>3.695285</td>\n",
|
||||
" <td>0.797655</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8902</th>\n",
|
||||
" <td>4.283530</td>\n",
|
||||
" <td>4.995899</td>\n",
|
||||
" <td>4.019444</td>\n",
|
||||
" <td>3.961054</td>\n",
|
||||
" <td>3.663294</td>\n",
|
||||
" <td>3.314231</td>\n",
|
||||
" <td>0.916339</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8903</th>\n",
|
||||
" <td>3.674866</td>\n",
|
||||
" <td>4.606504</td>\n",
|
||||
" <td>3.470283</td>\n",
|
||||
" <td>3.307148</td>\n",
|
||||
" <td>3.608739</td>\n",
|
||||
" <td>3.626463</td>\n",
|
||||
" <td>0.860751</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8904</th>\n",
|
||||
" <td>3.704409</td>\n",
|
||||
" <td>4.350563</td>\n",
|
||||
" <td>3.757374</td>\n",
|
||||
" <td>3.636318</td>\n",
|
||||
" <td>3.601366</td>\n",
|
||||
" <td>3.539929</td>\n",
|
||||
" <td>0.862651</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8905</th>\n",
|
||||
" <td>3.724967</td>\n",
|
||||
" <td>4.673218</td>\n",
|
||||
" <td>3.218182</td>\n",
|
||||
" <td>3.765976</td>\n",
|
||||
" <td>3.386151</td>\n",
|
||||
" <td>2.954136</td>\n",
|
||||
" <td>0.730686</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8906</th>\n",
|
||||
" <td>3.419836</td>\n",
|
||||
" <td>4.188060</td>\n",
|
||||
" <td>3.019416</td>\n",
|
||||
" <td>3.307404</td>\n",
|
||||
" <td>3.861704</td>\n",
|
||||
" <td>3.746484</td>\n",
|
||||
" <td>0.702885</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>8907 rows × 7 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" PM2.5 PM10 SO2 NO2 O3 O3_8h CO\n",
|
||||
"0 3.653185 4.700200 2.722381 3.261589 3.836444 3.857181 0.615219\n",
|
||||
"1 4.323887 4.923196 3.198502 4.016752 3.166474 3.591639 0.824886\n",
|
||||
"2 3.660165 4.662362 3.136948 3.513742 3.763910 3.671770 0.631061\n",
|
||||
"3 3.728112 4.645958 3.514411 3.718547 3.199907 3.291750 0.862777\n",
|
||||
"4 4.189668 4.743439 3.445615 3.674801 3.949052 3.695285 0.797655\n",
|
||||
"... ... ... ... ... ... ... ...\n",
|
||||
"8902 4.283530 4.995899 4.019444 3.961054 3.663294 3.314231 0.916339\n",
|
||||
"8903 3.674866 4.606504 3.470283 3.307148 3.608739 3.626463 0.860751\n",
|
||||
"8904 3.704409 4.350563 3.757374 3.636318 3.601366 3.539929 0.862651\n",
|
||||
"8905 3.724967 4.673218 3.218182 3.765976 3.386151 2.954136 0.730686\n",
|
||||
"8906 3.419836 4.188060 3.019416 3.307404 3.861704 3.746484 0.702885\n",
|
||||
"\n",
|
||||
"[8907 rows x 7 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"out_results = pd.DataFrame(rst, columns=out_cols)\n",
|
||||
"out_results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"COL: PM2.5, MSE: 2.30E-01,RMSE: 0.4799,MAPE: 9.99 %,MAE: 0.3802,R_2: -2.1369\n",
|
||||
"COL: PM10, MSE: 1.64E-01,RMSE: 0.4053,MAPE: 6.97 %,MAE: 0.3164,R_2: -2.1513\n",
|
||||
"COL: SO2, MSE: 4.32E-01,RMSE: 0.6574,MAPE: 16.439999999999998 %,MAE: 0.5326,R_2: -2.0811\n",
|
||||
"COL: NO2, MSE: 1.48E-01,RMSE: 0.3843,MAPE: 8.52 %,MAE: 0.3095,R_2: -2.3884\n",
|
||||
"COL: O3, MSE: 4.99E-01,RMSE: 0.7061,MAPE: 17.419999999999998 %,MAE: 0.5898,R_2: -2.0369\n",
|
||||
"COL: O3_8h, MSE: 4.19E-01,RMSE: 0.6471,MAPE: 15.73 %,MAE: 0.5331,R_2: -1.936\n",
|
||||
"COL: CO, MSE: 3.39E-02,RMSE: 0.1842,MAPE: 18.75 %,MAE: 0.1439,R_2: -2.1239\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for col in out_cols:\n",
"    # sklearn metrics expect (y_true, y_pred); R^2 and MAPE are not symmetric,\n",
"    # so the ground truth must come first.\n",
"    y_true = test_y[col].values\n",
"    y_pred = out_results[col].values\n",
"    MSE = mean_squared_error(y_true, y_pred)\n",
"    RMSE = np.sqrt(MSE)\n",
"    MAE = mean_absolute_error(y_true, y_pred)\n",
"    MAPE = mean_absolute_percentage_error(y_true, y_pred)\n",
"    R_2 = r2_score(y_true, y_pred)\n",
"    print(f\"COL: {col}, MSE: {format(MSE, '.2E')}\", end=',')\n",
"    print(f'RMSE: {round(RMSE, 4)}', end=',')\n",
"    # Scale to percent before rounding to avoid float noise such as 16.439999999999998.\n",
"    print(f'MAPE: {round(MAPE * 100, 2)} %', end=',')\n",
"    print(f'MAE: {round(MAE, 4)}', end=',')\n",
"    print(f'R_2: {round(R_2, 4)}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "py37",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.13"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "993bd31d5df1020fab369d79a34ff0a2a159e1798f3e25d3ad4b7751d38184c9"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
After Width: | Height: | Size: 1.6 MiB |
After Width: | Height: | Size: 2.0 MiB |
After Width: | Height: | Size: 1.7 MiB |
After Width: | Height: | Size: 1.7 MiB |
After Width: | Height: | Size: 1.7 MiB |
After Width: | Height: | Size: 1.6 MiB |
After Width: | Height: | Size: 1.8 MiB |
After Width: | Height: | Size: 2.3 MiB |
After Width: | Height: | Size: 1.9 MiB |
After Width: | Height: | Size: 2.1 MiB |
After Width: | Height: | Size: 1.9 MiB |
After Width: | Height: | Size: 1.8 MiB |
|
@ -0,0 +1,68 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "30222419-bfdf-4c4b-ada1-5aafbf27bc88",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"按照图示完成多任务学习的网络结构,部分网络层共享参数"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "573727b3-ebeb-4a34-ad53-7596363052fe",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/zhaojh/miniconda3/envs/py37/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from torch import nn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8d08f996-b93e-4e42-a889-b85aa438f4df",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MTLNN(nn.Module):\n",
|
||||
" def __init__(self):\n",
|
||||
" super(MTLNN, self).__init__()\n",
|
||||
" self.main_branch = nn.Sequential(\n",
|
||||
" nn.\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "py37",
|
||||
"language": "python",
|
||||
"name": "py37"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -0,0 +1,166 @@
|
|||
from cmath import log
|
||||
import pandas as pd
|
||||
import os
|
||||
import numpy as np
|
||||
from prophet import Prophet
|
||||
import datetime as dt
|
||||
from get_holiday_cn.client import getHoliday
|
||||
from logzero import logger
|
||||
import pickle
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def concat_date(x:str, y:str):
    """Combine a calendar date and an hour into a single timestamp.

    Args:
        x (str): Date written as ``YYYYMMDD``.
        y (str): Hour of the day.

    Returns:
        datetime.datetime: The timestamp at minute 0, second 0 of that hour.
    """
    stamp = "{} {}:00:00".format(x, y)
    parsed = dt.datetime.strptime(stamp, "%Y%m%d %H:%M:%S")
    return parsed
|
||||
|
||||
|
||||
def load_data(data_dir='./data/'):
    """Read the per-city CSV files and merge their PM2.5 / O3 rows into one frame.

    Scans ``data_dir`` for sub-folders whose name starts with ``城市_``, reads
    every file ending in ``csv`` inside them (folders and files in sorted
    order), keeps only the rows whose ``type`` is ``PM2.5`` or ``O3`` and adds
    a ``ds`` timestamp column built from the ``date`` and ``hour`` columns.

    Args:
        data_dir (str, optional): Root folder to scan. Defaults to ``'./data/'``.

    Returns:
        pd.DataFrame: All selected rows, sorted by ``ds`` ascending, with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If no CSV file was found under the matching folders.
    """
    data_folder = sorted(x for x in os.listdir(data_dir) if x.startswith('城市_'))
    # Collect the per-file frames and concatenate once: calling pd.concat inside
    # the loop re-copies everything read so far on every iteration.
    frames = []
    for folder in data_folder:
        folder_path = os.path.join(data_dir, folder)
        for file in sorted(os.listdir(folder_path)):
            if file.endswith('csv'):
                data = pd.read_csv(os.path.join(folder_path, file))
                frames.append(data[data['type'].isin(['PM2.5', 'O3'])].copy())
    if not frames:
        raise ValueError(f"no csv files found under {data_dir!r} in folders starting with '城市_'")
    total_data = pd.concat(frames)
    total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1)
    total_data.ds = pd.to_datetime(total_data.ds)
    total_data = total_data.sort_values(by='ds', ascending=True).reset_index(drop=True)
    logger.info(f"总数据集大小:{total_data.shape}")
    return total_data
|
||||
|
||||
|
||||
def build_model(city: str, data: pd.DataFrame, dtype:str, holiday_mode:pd.DataFrame, split_date="2021-01-01 00:00:00"):
    """Fit a Prophet model for one city / pollutant and forecast one year ahead.

    Rows of ``data`` whose ``type`` equals ``dtype`` are selected, the ``city``
    column becomes Prophet's ``y``, and only rows with ``ds`` earlier than
    ``split_date`` are used for fitting. The forecast covers the training
    history plus the following ``365*24`` hourly steps. The component plot is
    written to ``./figure/{city}_{dtype}_components.png``.

    Args:
        city (str): Column name of the city to model.
        data (pd.DataFrame): Frame with ``ds``, ``type`` and one column per city.
        dtype (str): Pollutant type to select, e.g. ``O3`` or ``PM2.5``.
        holiday_mode (pd.DataFrame): Holiday table passed to Prophet's
            ``holidays`` argument (as produced by ``build_holiday``).
        split_date (str, optional): Rows before this timestamp are the
            training set. Defaults to "2021-01-01 00:00:00".

    Returns:
        tuple: ``(model, forecast)`` - the fitted Prophet model and the
        forecast frame returned by ``model.predict``.

    Raises:
        ValueError: If no row of the selected series lies before ``split_date``.
    """
    logger.info(f"选择了 {city} 的 {dtype} 数据,")
    use_data = data[(data['type']==dtype)][["ds", city]].copy()
    use_data.columns = ["ds", "y"]
    train_data = use_data[use_data.ds < split_date].copy()
    if train_data.empty:
        # Fail with a clear message instead of an IndexError from iloc[-1] below.
        raise ValueError(f"no {dtype} rows for {city} before {split_date}")
    logger.info(train_data.iloc[-1].ds)
    model = Prophet(
        growth="linear",
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True,
        seasonality_mode="multiplicative",
        seasonality_prior_scale=12,
        holidays=holiday_mode,
        n_changepoints=100,  # number of change points, Prophet's default is 25
    )
    model.fit(train_data)
    future = model.make_future_dataframe(365*24, freq='H', include_history=True)
    forecast = model.predict(future)
    # Save through the figure object that plot_components returns rather than
    # the implicit "current figure", and close it so figures do not pile up
    # when this function is called for many cities.
    fig = model.plot_components(forecast)
    os.makedirs('./figure', exist_ok=True)
    fig.savefig(f'./figure/{city}_{dtype}_components.png')
    plt.close(fig)
    return model, forecast
|
||||
|
||||
|
||||
def get_date_type(date:str, holiday_client:getHoliday):
    """Classify a calendar date using the holiday lookup client.

    Args:
        date (str): Date written as "YYYY-MM-DD".
        holiday_client (getHoliday): Client whose ``assemble_holiday_data`` is queried.

    Returns:
        str | None: ``'oridinary'`` when the lookup succeeds and reports no
        holiday entry, the holiday entry's ``name`` when there is one, and
        ``None`` when the response ``code`` is not 0.
    """
    response = holiday_client.assemble_holiday_data(today=date)
    if response.get('code') != 0:
        # Lookup did not succeed; there is nothing to classify.
        return None
    holiday_info = response.get('holiday')
    if holiday_info is None:
        return 'oridinary'
    return holiday_info.get('name')
|
||||
|
||||
|
||||
def build_holiday(start_date:str="2015-01-01", end_date:str="2021-12-31"):
    """Build the Prophet holiday table for every day between two dates.

    Each day in the inclusive range is classified with ``get_date_type``; days
    classified as ordinary, and days whose lookup returned nothing, are dropped.

    Args:
        start_date (str): First day, "YYYY-MM-DD". Defaults to 2015-01-01.
        end_date (str): Last day, "YYYY-MM-DD". Defaults to 2021-12-31.

    Returns:
        pd.DataFrame: Columns ``ds`` (date string "YYYY-MM-DD") and
        ``holiday`` (holiday name), one row per special day.
    """
    ds_list = pd.DataFrame({'date': pd.date_range(start=start_date, end=end_date, freq='D')})
    ds_list['date'] = ds_list['date'].dt.strftime('%Y-%m-%d')
    client = getHoliday()
    ds_list['day_type'] = ds_list['date'].apply(lambda x: get_date_type(x, client))
    # get_date_type labels normal days 'oridinary' (sic) and yields None when the
    # lookup fails. Filtering on 'simple' never matched, so every day used to be
    # passed to Prophet as a holiday.
    is_special = ds_list['day_type'].notna() & (ds_list['day_type'] != 'oridinary')
    special_date = ds_list[is_special].copy()
    special_date.columns = ['ds', 'holiday']
    return special_date
|
||||
|
||||
def train(data_type, city_list, data):
    """Fit one Prophet model per city and persist forecasts and models.

    Forecasts are written to ``./result/{data_type}/data/{city}.csv`` and the
    fitted models are pickled to ``./result/{data_type}/model/{city}.pkl``.

    Args:
        data_type: Pollutant type forwarded to ``build_model`` (e.g. ``'O3'``).
        city_list: Iterable of city column names to model.
        data: Frame with a ``ds`` column; its min/max bound the holiday table.

    Returns:
        tuple: ``(model_dict, predict_dict)``, both keyed by city.
    """
    model_dict = dict()
    predict_dict = dict()
    holiday_data = build_holiday(data.ds.min(), data.ds.max())
    for city in city_list:
        model, pred = build_model(city, data, data_type, holiday_data, '2021-01-01')
        model_dict[city] = model
        predict_dict[city] = pred
        logger.info(f"{city} 模型构建完成")

    # makedirs creates the intermediate ./result/ and ./result/{data_type}/
    # levels too and tolerates directories that already exist.
    os.makedirs(f'./result/{data_type}/model', exist_ok=True)
    os.makedirs(f'./result/{data_type}/data/', exist_ok=True)

    for city, city_pred in predict_dict.items():
        city_pred.to_csv(f'./result/{data_type}/data/{city}.csv', encoding='utf-8', index=False)
        logger.info(f"{city} 预测数据保存完成")

    for city, city_model in model_dict.items():
        # NOTE(review): the models are stored with pickle; only load these
        # files back from a trusted location.
        with open(f'./result/{data_type}/model/{city}.pkl', 'wb') as fwb:
            pickle.dump(city_model, fwb)
        logger.info(f"{city} 模型保存完成")

    return model_dict, predict_dict
|
||||
|
||||
if __name__ == '__main__':
    data_type = 'O3'  # change this to switch the pollutant type
    city_list = ['北京']  # add more cities here
    if os.path.exists('./data/total_data.csv'):
        # Parse ds on reload so a cached run sees the same datetime column as a
        # fresh load_data() run instead of plain strings.
        data = pd.read_csv('./data/total_data.csv', parse_dates=['ds'])
    else:
        data = load_data()
        data.to_csv('./data/total_data.csv', encoding='utf-8', index=False)

    model_dict, pred_list = train(data_type, city_list, data)
|
||||
'''
|
||||
# if test
|
||||
# 从存储的模型中加载
|
||||
with open('./result/O3/model/北京.pkl', 'rb') as fr:
|
||||
local_model = pickle.load(fr)
|
||||
'''
|
|
@ -0,0 +1,33 @@
|
|||
import pandas as pd
|
||||
import os
|
||||
import numpy as np
|
||||
from prophet import Prophet
|
||||
from sklearn.metrics import mean_absolute_error
|
||||
# Folder holding the daily input CSV; the renamed copy is written back here.
path="E:/Predition/data/environment_data/城市/处理表头数据/daily_results/normal_daily_results"
df = pd.read_csv(f'{path}/test.csv', index_col='date', parse_dates=['date'])
print(df.shape)
df.reset_index(inplace=True)
df.columns = ["ds", "y"]  # Prophet expects exactly these two column names
output_file = os.path.join(path, 'test-output.csv')
df.to_csv(output_file)

# Chronological 70 / 30 split: the first 70 % of rows train, the rest test.
split_at = int(df.shape[0]*0.7)
train = df[:split_at]
test = df[split_at:]
print(train.shape, test.shape)

model = Prophet(
    growth="linear",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    seasonality_mode="multiplicative",
    seasonality_prior_scale=12,
)
model.fit(train)
forecast = model.predict(test)
print(forecast)
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())

# Fill with numeric 0; the string '0' would turn the column into object dtype.
new_y = df.y.fillna(0)
df['new_y'] = new_y.values

# Timestamps of the test rows that have no observed value.
na_index = test[test.y.isna()].ds.values
print(na_index)

# Score only the test rows with an observed y, aligned to the forecast by
# timestamp. The previous version referenced an undefined `new_df`.
scored = test[['ds', 'y']].merge(forecast[['ds', 'yhat']], on='ds', how='inner').dropna(subset=['y'])
print("在测试集上绝对值预测误差为:",mean_absolute_error(scored.y.values, scored.yhat.values))
|
|
@ -0,0 +1,918 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(52583, 54)"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data = pd.read_csv('./data/ori_data.csv')\n",
|
||||
"data.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>PM2.5</th>\n",
|
||||
" <th>PM10</th>\n",
|
||||
" <th>SO2</th>\n",
|
||||
" <th>NO2</th>\n",
|
||||
" <th>O3</th>\n",
|
||||
" <th>CO</th>\n",
|
||||
" <th>Ox</th>\n",
|
||||
" <th>wind-U</th>\n",
|
||||
" <th>wind-V</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>VOC_power</th>\n",
|
||||
" <th>VOC_agricultural</th>\n",
|
||||
" <th>PM2.5_industrial</th>\n",
|
||||
" <th>PM2.5_transportation</th>\n",
|
||||
" <th>PM2.5_resdient</th>\n",
|
||||
" <th>PM2.5_power</th>\n",
|
||||
" <th>PM2.5_agricultural</th>\n",
|
||||
" <th>CO_Bio</th>\n",
|
||||
" <th>VOCs_Bio</th>\n",
|
||||
" <th>pre_time</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2015-01-02 01:00:00</td>\n",
|
||||
" <td>136.0</td>\n",
|
||||
" <td>214.0</td>\n",
|
||||
" <td>317.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>3.71</td>\n",
|
||||
" <td>46.0</td>\n",
|
||||
" <td>0.831775</td>\n",
|
||||
" <td>-0.555113</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.037724</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.926851</td>\n",
|
||||
" <td>0.077715</td>\n",
|
||||
" <td>0.827110</td>\n",
|
||||
" <td>0.436028</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.081546</td>\n",
|
||||
" <td>4.217706</td>\n",
|
||||
" <td>2015-01-02 00:00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2015-01-02 02:00:00</td>\n",
|
||||
" <td>114.0</td>\n",
|
||||
" <td>176.0</td>\n",
|
||||
" <td>305.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>3.55</td>\n",
|
||||
" <td>46.0</td>\n",
|
||||
" <td>-0.695011</td>\n",
|
||||
" <td>-0.083426</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.036215</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.926851</td>\n",
|
||||
" <td>0.081248</td>\n",
|
||||
" <td>0.827110</td>\n",
|
||||
" <td>0.418587</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.080031</td>\n",
|
||||
" <td>4.119807</td>\n",
|
||||
" <td>2015-01-02 01:00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2015-01-02 03:00:00</td>\n",
|
||||
" <td>97.0</td>\n",
|
||||
" <td>154.0</td>\n",
|
||||
" <td>306.0</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>3.51</td>\n",
|
||||
" <td>44.0</td>\n",
|
||||
" <td>-0.173311</td>\n",
|
||||
" <td>0.469003</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.035712</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.926851</td>\n",
|
||||
" <td>0.088313</td>\n",
|
||||
" <td>0.827110</td>\n",
|
||||
" <td>0.412773</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.077761</td>\n",
|
||||
" <td>3.973464</td>\n",
|
||||
" <td>2015-01-02 02:00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2015-01-02 04:00:00</td>\n",
|
||||
" <td>87.0</td>\n",
|
||||
" <td>141.0</td>\n",
|
||||
" <td>316.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>3.55</td>\n",
|
||||
" <td>45.0</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>-0.200000</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.036718</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.926851</td>\n",
|
||||
" <td>0.091256</td>\n",
|
||||
" <td>0.827110</td>\n",
|
||||
" <td>0.424400</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.076766</td>\n",
|
||||
" <td>3.909235</td>\n",
|
||||
" <td>2015-01-02 03:00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2015-01-02 05:00:00</td>\n",
|
||||
" <td>85.0</td>\n",
|
||||
" <td>139.0</td>\n",
|
||||
" <td>292.0</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>3.62</td>\n",
|
||||
" <td>44.0</td>\n",
|
||||
" <td>1.234518</td>\n",
|
||||
" <td>0.660276</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.039736</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.926851</td>\n",
|
||||
" <td>0.092434</td>\n",
|
||||
" <td>1.746121</td>\n",
|
||||
" <td>0.459282</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.077119</td>\n",
|
||||
" <td>3.930702</td>\n",
|
||||
" <td>2015-01-02 04:00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 54 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date PM2.5 PM10 SO2 NO2 O3 CO Ox wind-U \\\n",
|
||||
"0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 0.831775 \n",
|
||||
"1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 -0.695011 \n",
|
||||
"2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 -0.173311 \n",
|
||||
"3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 0.000000 \n",
|
||||
"4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 1.234518 \n",
|
||||
"\n",
|
||||
" wind-V ... VOC_power VOC_agricultural PM2.5_industrial \\\n",
|
||||
"0 -0.555113 ... 0.037724 0.0 0.926851 \n",
|
||||
"1 -0.083426 ... 0.036215 0.0 0.926851 \n",
|
||||
"2 0.469003 ... 0.035712 0.0 0.926851 \n",
|
||||
"3 -0.200000 ... 0.036718 0.0 0.926851 \n",
|
||||
"4 0.660276 ... 0.039736 0.0 0.926851 \n",
|
||||
"\n",
|
||||
" PM2.5_transportation PM2.5_resdient PM2.5_power PM2.5_agricultural \\\n",
|
||||
"0 0.077715 0.827110 0.436028 0.0 \n",
|
||||
"1 0.081248 0.827110 0.418587 0.0 \n",
|
||||
"2 0.088313 0.827110 0.412773 0.0 \n",
|
||||
"3 0.091256 0.827110 0.424400 0.0 \n",
|
||||
"4 0.092434 1.746121 0.459282 0.0 \n",
|
||||
"\n",
|
||||
" CO_Bio VOCs_Bio pre_time \n",
|
||||
"0 0.081546 4.217706 2015-01-02 00:00:00 \n",
|
||||
"1 0.080031 4.119807 2015-01-02 01:00:00 \n",
|
||||
"2 0.077761 3.973464 2015-01-02 02:00:00 \n",
|
||||
"3 0.076766 3.909235 2015-01-02 03:00:00 \n",
|
||||
"4 0.077119 3.930702 2015-01-02 04:00:00 \n",
|
||||
"\n",
|
||||
"[5 rows x 54 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.date = pd.to_datetime(data.date)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import datetime as dt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_lookdays(x):\n",
"    \"\"\"Return the hourly timestamps from 24 hours before ``x`` up to 1 hour before ``x``.\n",
"\n",
"    Each timestamp is rendered as a 'YYYY-MM-DD HH:MM:SS' string.\n",
"    \"\"\"\n",
"    window = pd.date_range(x - dt.timedelta(hours=24), x - dt.timedelta(hours=1), freq='H')\n",
"    stamps = window.tolist()\n",
"    return [dt.datetime.strftime(stamp, '%Y-%m-%d %H:%M:%S') for stamp in stamps]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"date_range = pd.date_range(data.date.min(), data.date.max(), freq='H')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'CO']"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"out_cols = data.columns[1:7].tolist()\n",
|
||||
"out_cols"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data['day'] = data.date.apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d'))\n",
|
||||
"na_counts = data.set_index('day')[out_cols].isna().groupby('day').sum()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"drop_days = na_counts[na_counts.SO2>5].index.values"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = data.set_index('date').interpolate(method='linear')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = data.reset_index()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# `drop_days` holds 'YYYY-MM-DD' strings taken from the per-day NaN counts, so\n",
"# filter on the string `day` column. Matching against the hourly `date`\n",
"# timestamps would at most remove the midnight row of each flagged day.\n",
"data = data[~data.day.isin(drop_days)].copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = data.set_index('date').reindex(date_range)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.reset_index(inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.rename(columns={'index':'date'}, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data['lookdays'] = data.date.apply(get_lookdays)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data['features'] = data.lookdays.apply(lambda x: data[data.date.isin(x)][out_cols].values.reshape(-1,).tolist())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data['feature_len'] = data.features.apply(lambda x: len(x))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"save_data = data[data.feature_len >=144].copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pre_cols = list()\n",
|
||||
"for i in range(24, 0, -1):\n",
|
||||
" for j in out_cols:\n",
|
||||
" pre_cols.append(f\"{i}_{j}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"previous_out = save_data.features.apply(pd.Series, index=pre_cols)\n",
|
||||
"previous_out['date'] = save_data.date.values"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>PM2.5</th>\n",
|
||||
" <th>PM10</th>\n",
|
||||
" <th>SO2</th>\n",
|
||||
" <th>NO2</th>\n",
|
||||
" <th>O3</th>\n",
|
||||
" <th>CO</th>\n",
|
||||
" <th>Ox</th>\n",
|
||||
" <th>wind-U</th>\n",
|
||||
" <th>wind-V</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>2_SO2</th>\n",
|
||||
" <th>2_NO2</th>\n",
|
||||
" <th>2_O3</th>\n",
|
||||
" <th>2_CO</th>\n",
|
||||
" <th>1_PM2.5</th>\n",
|
||||
" <th>1_PM10</th>\n",
|
||||
" <th>1_SO2</th>\n",
|
||||
" <th>1_NO2</th>\n",
|
||||
" <th>1_O3</th>\n",
|
||||
" <th>1_CO</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2015-01-02 01:00:00</td>\n",
|
||||
" <td>136.0</td>\n",
|
||||
" <td>214.0</td>\n",
|
||||
" <td>317.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>3.71</td>\n",
|
||||
" <td>46.0</td>\n",
|
||||
" <td>0.831775</td>\n",
|
||||
" <td>-0.555113</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2015-01-02 02:00:00</td>\n",
|
||||
" <td>114.0</td>\n",
|
||||
" <td>176.0</td>\n",
|
||||
" <td>305.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>8.0</td>\n",
|
||||
" <td>3.55</td>\n",
|
||||
" <td>46.0</td>\n",
|
||||
" <td>-0.695011</td>\n",
|
||||
" <td>-0.083426</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2015-01-02 03:00:00</td>\n",
|
||||
" <td>97.0</td>\n",
|
||||
" <td>154.0</td>\n",
|
||||
" <td>306.0</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>3.51</td>\n",
|
||||
" <td>44.0</td>\n",
|
||||
" <td>-0.173311</td>\n",
|
||||
" <td>0.469003</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2015-01-02 04:00:00</td>\n",
|
||||
" <td>87.0</td>\n",
|
||||
" <td>141.0</td>\n",
|
||||
" <td>316.0</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>3.55</td>\n",
|
||||
" <td>45.0</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>-0.200000</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2015-01-02 05:00:00</td>\n",
|
||||
" <td>85.0</td>\n",
|
||||
" <td>139.0</td>\n",
|
||||
" <td>292.0</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>3.62</td>\n",
|
||||
" <td>44.0</td>\n",
|
||||
" <td>1.234518</td>\n",
|
||||
" <td>0.660276</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>52578</th>\n",
|
||||
" <td>2020-12-31 19:00:00</td>\n",
|
||||
" <td>27.0</td>\n",
|
||||
" <td>51.0</td>\n",
|
||||
" <td>16.0</td>\n",
|
||||
" <td>46.0</td>\n",
|
||||
" <td>29.0</td>\n",
|
||||
" <td>0.72</td>\n",
|
||||
" <td>75.0</td>\n",
|
||||
" <td>1.067581</td>\n",
|
||||
" <td>-0.265087</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.0</td>\n",
|
||||
" <td>34.0</td>\n",
|
||||
" <td>43.0</td>\n",
|
||||
" <td>0.75</td>\n",
|
||||
" <td>31.0</td>\n",
|
||||
" <td>59.0</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>29.0</td>\n",
|
||||
" <td>0.91</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>52579</th>\n",
|
||||
" <td>2020-12-31 20:00:00</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>51.0</td>\n",
|
||||
" <td>12.0</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0.83</td>\n",
|
||||
" <td>73.0</td>\n",
|
||||
" <td>0.029164</td>\n",
|
||||
" <td>0.298579</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>29.0</td>\n",
|
||||
" <td>0.91</td>\n",
|
||||
" <td>27.0</td>\n",
|
||||
" <td>51.0</td>\n",
|
||||
" <td>16.0</td>\n",
|
||||
" <td>46.0</td>\n",
|
||||
" <td>29.0</td>\n",
|
||||
" <td>0.72</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>52580</th>\n",
|
||||
" <td>2020-12-31 21:00:00</td>\n",
|
||||
" <td>29.0</td>\n",
|
||||
" <td>58.0</td>\n",
|
||||
" <td>16.0</td>\n",
|
||||
" <td>48.0</td>\n",
|
||||
" <td>25.0</td>\n",
|
||||
" <td>1.15</td>\n",
|
||||
" <td>73.0</td>\n",
|
||||
" <td>-0.079532</td>\n",
|
||||
" <td>0.896479</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.0</td>\n",
|
||||
" <td>46.0</td>\n",
|
||||
" <td>29.0</td>\n",
|
||||
" <td>0.72</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>51.0</td>\n",
|
||||
" <td>12.0</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0.83</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>52581</th>\n",
|
||||
" <td>2020-12-31 22:00:00</td>\n",
|
||||
" <td>32.0</td>\n",
|
||||
" <td>60.0</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>49.0</td>\n",
|
||||
" <td>20.0</td>\n",
|
||||
" <td>0.90</td>\n",
|
||||
" <td>69.0</td>\n",
|
||||
" <td>-1.660193</td>\n",
|
||||
" <td>0.365729</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>12.0</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0.83</td>\n",
|
||||
" <td>29.0</td>\n",
|
||||
" <td>58.0</td>\n",
|
||||
" <td>16.0</td>\n",
|
||||
" <td>48.0</td>\n",
|
||||
" <td>25.0</td>\n",
|
||||
" <td>1.15</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>52582</th>\n",
|
||||
" <td>2020-12-31 23:00:00</td>\n",
|
||||
" <td>53.0</td>\n",
|
||||
" <td>94.0</td>\n",
|
||||
" <td>41.0</td>\n",
|
||||
" <td>57.0</td>\n",
|
||||
" <td>12.0</td>\n",
|
||||
" <td>1.18</td>\n",
|
||||
" <td>69.0</td>\n",
|
||||
" <td>-0.106042</td>\n",
|
||||
" <td>1.195305</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.0</td>\n",
|
||||
" <td>48.0</td>\n",
|
||||
" <td>25.0</td>\n",
|
||||
" <td>1.15</td>\n",
|
||||
" <td>32.0</td>\n",
|
||||
" <td>60.0</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>49.0</td>\n",
|
||||
" <td>20.0</td>\n",
|
||||
" <td>0.90</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>52583 rows × 200 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date PM2.5 PM10 SO2 NO2 O3 CO Ox \\\n",
|
||||
"0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 \n",
|
||||
"1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 \n",
|
||||
"2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 \n",
|
||||
"3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 \n",
|
||||
"4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 \n",
|
||||
"... ... ... ... ... ... ... ... ... \n",
|
||||
"52578 2020-12-31 19:00:00 27.0 51.0 16.0 46.0 29.0 0.72 75.0 \n",
|
||||
"52579 2020-12-31 20:00:00 26.0 51.0 12.0 47.0 26.0 0.83 73.0 \n",
|
||||
"52580 2020-12-31 21:00:00 29.0 58.0 16.0 48.0 25.0 1.15 73.0 \n",
|
||||
"52581 2020-12-31 22:00:00 32.0 60.0 23.0 49.0 20.0 0.90 69.0 \n",
|
||||
"52582 2020-12-31 23:00:00 53.0 94.0 41.0 57.0 12.0 1.18 69.0 \n",
|
||||
"\n",
|
||||
" wind-U wind-V ... 2_SO2 2_NO2 2_O3 2_CO 1_PM2.5 1_PM10 \\\n",
|
||||
"0 0.831775 -0.555113 ... NaN NaN NaN NaN NaN NaN \n",
|
||||
"1 -0.695011 -0.083426 ... NaN NaN NaN NaN NaN NaN \n",
|
||||
"2 -0.173311 0.469003 ... NaN NaN NaN NaN NaN NaN \n",
|
||||
"3 0.000000 -0.200000 ... NaN NaN NaN NaN NaN NaN \n",
|
||||
"4 1.234518 0.660276 ... NaN NaN NaN NaN NaN NaN \n",
|
||||
"... ... ... ... ... ... ... ... ... ... \n",
|
||||
"52578 1.067581 -0.265087 ... 25.0 34.0 43.0 0.75 31.0 59.0 \n",
|
||||
"52579 0.029164 0.298579 ... 21.0 47.0 29.0 0.91 27.0 51.0 \n",
|
||||
"52580 -0.079532 0.896479 ... 16.0 46.0 29.0 0.72 26.0 51.0 \n",
|
||||
"52581 -1.660193 0.365729 ... 12.0 47.0 26.0 0.83 29.0 58.0 \n",
|
||||
"52582 -0.106042 1.195305 ... 16.0 48.0 25.0 1.15 32.0 60.0 \n",
|
||||
"\n",
|
||||
" 1_SO2 1_NO2 1_O3 1_CO \n",
|
||||
"0 NaN NaN NaN NaN \n",
|
||||
"1 NaN NaN NaN NaN \n",
|
||||
"2 NaN NaN NaN NaN \n",
|
||||
"3 NaN NaN NaN NaN \n",
|
||||
"4 NaN NaN NaN NaN \n",
|
||||
"... ... ... ... ... \n",
|
||||
"52578 21.0 47.0 29.0 0.91 \n",
|
||||
"52579 16.0 46.0 29.0 0.72 \n",
|
||||
"52580 12.0 47.0 26.0 0.83 \n",
|
||||
"52581 16.0 48.0 25.0 1.15 \n",
|
||||
"52582 23.0 49.0 20.0 0.90 \n",
|
||||
"\n",
|
||||
"[52583 rows x 200 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.drop(columns=['features', 'feature_len']).merge(previous_out, on='date', how='left')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_data = previous_out.merge(data.drop(columns=['features', 'feature_len', 'lookdays', 'pre_time']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"drop_cols = [x for x in new_data.columns if 'agricultural' in x] + ['NH3_power'] + ['CO_Bio', 'VOCs_Bio']\n",
|
||||
"drop_cols.remove('NH3_agricultural')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_data.drop(columns=drop_cols, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_data.dropna(inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_data.drop(columns=['day'], inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(49014, 188)"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"new_data.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_data.set_index('date').to_csv('new_train_data.csv', encoding='utf-8-sig')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "py37",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.13"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|