This commit is contained in:
赵敬皓 2023-03-30 10:25:44 +08:00
parent 05834230c0
commit 25a6d1bb2e
25 changed files with 14852 additions and 0 deletions

601
catboost.ipynb Normal file
View File

@ -0,0 +1,601 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import catboost\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from catboost import CatBoostRegressor"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PM2.5</th>\n",
" <th>PM10</th>\n",
" <th>SO2</th>\n",
" <th>NO2</th>\n",
" <th>O3</th>\n",
" <th>O3_8h</th>\n",
" <th>CO</th>\n",
" <th>wd</th>\n",
" <th>ws</th>\n",
" <th>air_temp</th>\n",
" <th>...</th>\n",
" <th>PM2.5_transportation</th>\n",
" <th>PM2.5_resdient</th>\n",
" <th>PM2.5_power</th>\n",
" <th>pre_PM2.5</th>\n",
" <th>pre_PM10</th>\n",
" <th>pre_SO2</th>\n",
" <th>pre_NO2</th>\n",
" <th>pre_O3</th>\n",
" <th>pre_O3_8h</th>\n",
" <th>pre_CO</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.744932</td>\n",
" <td>5.176150</td>\n",
" <td>5.723585</td>\n",
" <td>3.663562</td>\n",
" <td>2.197225</td>\n",
" <td>2.302585</td>\n",
" <td>1.515127</td>\n",
" <td>58.0</td>\n",
" <td>0.7</td>\n",
" <td>-11.1</td>\n",
" <td>...</td>\n",
" <td>0.081248</td>\n",
" <td>0.827110</td>\n",
" <td>0.418587</td>\n",
" <td>136.0</td>\n",
" <td>214.0</td>\n",
" <td>317.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>9.0</td>\n",
" <td>3.71</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.584967</td>\n",
" <td>5.043425</td>\n",
" <td>5.726848</td>\n",
" <td>3.637586</td>\n",
" <td>2.079442</td>\n",
" <td>2.197225</td>\n",
" <td>1.506297</td>\n",
" <td>185.0</td>\n",
" <td>0.5</td>\n",
" <td>-11.7</td>\n",
" <td>...</td>\n",
" <td>0.088313</td>\n",
" <td>0.827110</td>\n",
" <td>0.412773</td>\n",
" <td>114.0</td>\n",
" <td>176.0</td>\n",
" <td>305.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>9.0</td>\n",
" <td>3.55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.477337</td>\n",
" <td>4.955827</td>\n",
" <td>5.758902</td>\n",
" <td>3.663562</td>\n",
" <td>2.079442</td>\n",
" <td>2.197225</td>\n",
" <td>1.515127</td>\n",
" <td>0.0</td>\n",
" <td>0.2</td>\n",
" <td>-12.7</td>\n",
" <td>...</td>\n",
" <td>0.091256</td>\n",
" <td>0.827110</td>\n",
" <td>0.424400</td>\n",
" <td>97.0</td>\n",
" <td>154.0</td>\n",
" <td>306.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>8.0</td>\n",
" <td>3.51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.454347</td>\n",
" <td>4.941642</td>\n",
" <td>5.680173</td>\n",
" <td>3.637586</td>\n",
" <td>2.079442</td>\n",
" <td>2.197225</td>\n",
" <td>1.530395</td>\n",
" <td>199.0</td>\n",
" <td>1.4</td>\n",
" <td>-10.9</td>\n",
" <td>...</td>\n",
" <td>0.092434</td>\n",
" <td>1.746121</td>\n",
" <td>0.459282</td>\n",
" <td>87.0</td>\n",
" <td>141.0</td>\n",
" <td>316.0</td>\n",
" <td>38.0</td>\n",
" <td>7.0</td>\n",
" <td>8.0</td>\n",
" <td>3.55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4.672829</td>\n",
" <td>5.123964</td>\n",
" <td>5.758902</td>\n",
" <td>3.637586</td>\n",
" <td>2.197225</td>\n",
" <td>2.197225</td>\n",
" <td>1.605430</td>\n",
" <td>359.0</td>\n",
" <td>1.2</td>\n",
" <td>-12.3</td>\n",
" <td>...</td>\n",
" <td>0.170738</td>\n",
" <td>3.446292</td>\n",
" <td>0.514513</td>\n",
" <td>85.0</td>\n",
" <td>139.0</td>\n",
" <td>292.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>8.0</td>\n",
" <td>3.62</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 49 columns</p>\n",
"</div>"
],
"text/plain": [
" PM2.5 PM10 SO2 NO2 O3 O3_8h CO \\\n",
"0 4.744932 5.176150 5.723585 3.663562 2.197225 2.302585 1.515127 \n",
"1 4.584967 5.043425 5.726848 3.637586 2.079442 2.197225 1.506297 \n",
"2 4.477337 4.955827 5.758902 3.663562 2.079442 2.197225 1.515127 \n",
"3 4.454347 4.941642 5.680173 3.637586 2.079442 2.197225 1.530395 \n",
"4 4.672829 5.123964 5.758902 3.637586 2.197225 2.197225 1.605430 \n",
"\n",
" wd ws air_temp ... PM2.5_transportation PM2.5_resdient \\\n",
"0 58.0 0.7 -11.1 ... 0.081248 0.827110 \n",
"1 185.0 0.5 -11.7 ... 0.088313 0.827110 \n",
"2 0.0 0.2 -12.7 ... 0.091256 0.827110 \n",
"3 199.0 1.4 -10.9 ... 0.092434 1.746121 \n",
"4 359.0 1.2 -12.3 ... 0.170738 3.446292 \n",
"\n",
" PM2.5_power pre_PM2.5 pre_PM10 pre_SO2 pre_NO2 pre_O3 pre_O3_8h \\\n",
"0 0.418587 136.0 214.0 317.0 38.0 8.0 9.0 \n",
"1 0.412773 114.0 176.0 305.0 38.0 8.0 9.0 \n",
"2 0.424400 97.0 154.0 306.0 37.0 7.0 8.0 \n",
"3 0.459282 87.0 141.0 316.0 38.0 7.0 8.0 \n",
"4 0.514513 85.0 139.0 292.0 37.0 7.0 8.0 \n",
"\n",
" pre_CO \n",
"0 3.71 \n",
"1 3.55 \n",
"2 3.51 \n",
"3 3.55 \n",
"4 3.62 \n",
"\n",
"[5 rows x 49 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv('./data/train_data_mod.csv')\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'O3_8h', 'CO'], dtype='object')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_cols = data.columns[7:]\n",
"out_cols = data.columns[:7]\n",
"out_cols"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"\n",
"train_X, test_X, train_y, test_y = train_test_split(data[feature_cols], data[out_cols], test_size=0.2,\n",
" random_state=42)\n",
"#准备参数\n",
"other_params = {'learning_rate': 0.01, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,\n",
" 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}\n",
"\n",
"params_gbm = {\n",
" 'task': 'train',\n",
" 'boosting_type': 'gbdt', # 设置提升类型\n",
" 'objective': 'l1', # 目标函数\n",
" 'metric': 'rmse', # 评估函数\n",
" 'max_depth': 10,\n",
" 'num_leaves': 20, # 叶子节点数\n",
" 'learning_rate': 0.09, # 学习速率\n",
" 'feature_fraction': 0.9, # 建树的特征选择比例\n",
" 'bagging_fraction': 0.9, # 建树的样本采样比例\n",
" 'bagging_freq': 10, # k 意味着每 k 次迭代执行bagging\n",
" 'verbose': -1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import lightgbm as lgb\n",
"from sklearn.multioutput import MultiOutputRegressor\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"base_model = lgb.LGBMRegressor(**params_gbm)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"base_cat = CatBoostRegressor(iterations=1000, learning_rate=0.0005, depth=10, loss_function='RMSE', eval_metric='RMSE', random_seed=99, od_type='Iter', od_wait=50, verbose=0)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"multioutputregressor = MultiOutputRegressor(base_cat).fit(train_X, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"rst = multioutputregressor.predict(test_X)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PM2.5</th>\n",
" <th>PM10</th>\n",
" <th>SO2</th>\n",
" <th>NO2</th>\n",
" <th>O3</th>\n",
" <th>O3_8h</th>\n",
" <th>CO</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3.653185</td>\n",
" <td>4.700200</td>\n",
" <td>2.722381</td>\n",
" <td>3.261589</td>\n",
" <td>3.836444</td>\n",
" <td>3.857181</td>\n",
" <td>0.615219</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.323887</td>\n",
" <td>4.923196</td>\n",
" <td>3.198502</td>\n",
" <td>4.016752</td>\n",
" <td>3.166474</td>\n",
" <td>3.591639</td>\n",
" <td>0.824886</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.660165</td>\n",
" <td>4.662362</td>\n",
" <td>3.136948</td>\n",
" <td>3.513742</td>\n",
" <td>3.763910</td>\n",
" <td>3.671770</td>\n",
" <td>0.631061</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.728112</td>\n",
" <td>4.645958</td>\n",
" <td>3.514411</td>\n",
" <td>3.718547</td>\n",
" <td>3.199907</td>\n",
" <td>3.291750</td>\n",
" <td>0.862777</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4.189668</td>\n",
" <td>4.743439</td>\n",
" <td>3.445615</td>\n",
" <td>3.674801</td>\n",
" <td>3.949052</td>\n",
" <td>3.695285</td>\n",
" <td>0.797655</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8902</th>\n",
" <td>4.283530</td>\n",
" <td>4.995899</td>\n",
" <td>4.019444</td>\n",
" <td>3.961054</td>\n",
" <td>3.663294</td>\n",
" <td>3.314231</td>\n",
" <td>0.916339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8903</th>\n",
" <td>3.674866</td>\n",
" <td>4.606504</td>\n",
" <td>3.470283</td>\n",
" <td>3.307148</td>\n",
" <td>3.608739</td>\n",
" <td>3.626463</td>\n",
" <td>0.860751</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8904</th>\n",
" <td>3.704409</td>\n",
" <td>4.350563</td>\n",
" <td>3.757374</td>\n",
" <td>3.636318</td>\n",
" <td>3.601366</td>\n",
" <td>3.539929</td>\n",
" <td>0.862651</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8905</th>\n",
" <td>3.724967</td>\n",
" <td>4.673218</td>\n",
" <td>3.218182</td>\n",
" <td>3.765976</td>\n",
" <td>3.386151</td>\n",
" <td>2.954136</td>\n",
" <td>0.730686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8906</th>\n",
" <td>3.419836</td>\n",
" <td>4.188060</td>\n",
" <td>3.019416</td>\n",
" <td>3.307404</td>\n",
" <td>3.861704</td>\n",
" <td>3.746484</td>\n",
" <td>0.702885</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8907 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" PM2.5 PM10 SO2 NO2 O3 O3_8h CO\n",
"0 3.653185 4.700200 2.722381 3.261589 3.836444 3.857181 0.615219\n",
"1 4.323887 4.923196 3.198502 4.016752 3.166474 3.591639 0.824886\n",
"2 3.660165 4.662362 3.136948 3.513742 3.763910 3.671770 0.631061\n",
"3 3.728112 4.645958 3.514411 3.718547 3.199907 3.291750 0.862777\n",
"4 4.189668 4.743439 3.445615 3.674801 3.949052 3.695285 0.797655\n",
"... ... ... ... ... ... ... ...\n",
"8902 4.283530 4.995899 4.019444 3.961054 3.663294 3.314231 0.916339\n",
"8903 3.674866 4.606504 3.470283 3.307148 3.608739 3.626463 0.860751\n",
"8904 3.704409 4.350563 3.757374 3.636318 3.601366 3.539929 0.862651\n",
"8905 3.724967 4.673218 3.218182 3.765976 3.386151 2.954136 0.730686\n",
"8906 3.419836 4.188060 3.019416 3.307404 3.861704 3.746484 0.702885\n",
"\n",
"[8907 rows x 7 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out_results = pd.DataFrame(rst, columns=out_cols)\n",
"out_results"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Score every target column on the hold-out split.\n",
"# sklearn metrics take (y_true, y_pred). MAPE and R^2 are not symmetric, so the\n",
"# previous (prediction, truth) order reported wrong values for both.\n",
"# NOTE(review): the targets look log-transformed (compare PM2.5 with pre_PM2.5 in\n",
"# data.head()), so these errors are presumably in log space - confirm.\n",
"for col in out_cols:\n",
"    y_true = test_y[col].values\n",
"    y_pred = out_results[col].values\n",
"    MSE = mean_squared_error(y_true, y_pred)\n",
"    RMSE = np.sqrt(MSE)\n",
"    MAE = mean_absolute_error(y_true, y_pred)\n",
"    MAPE = mean_absolute_percentage_error(y_true, y_pred)\n",
"    R_2 = r2_score(y_true, y_pred)\n",
"    print(f\"COL: {col}, MSE: {format(MSE, '.2E')}\", end=',')\n",
"    print(f'RMSE: {round(RMSE, 4)}', end=',')\n",
"    # Scale before rounding so the percentage prints without float noise.\n",
"    print(f'MAPE: {round(MAPE * 100, 2)} %', end=',')\n",
"    print(f'MAE: {round(MAE, 4)}', end=',')\n",
"    print(f'R_2: {round(R_2, 4)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py37",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "993bd31d5df1020fab369d79a34ff0a2a159e1798f3e25d3ad4b7751d38184c9"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

BIN
figure/lookback/CO.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

BIN
figure/lookback/NO2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 MiB

BIN
figure/lookback/O3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 MiB

BIN
figure/lookback/PM10.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 MiB

BIN
figure/lookback/PM25.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 MiB

BIN
figure/lookback/SO2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

BIN
figure/looknow/CO.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

BIN
figure/looknow/NO2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 MiB

BIN
figure/looknow/O3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 MiB

BIN
figure/looknow/PM10.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 MiB

BIN
figure/looknow/PM25.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 MiB

BIN
figure/looknow/SO2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

1183
keras_multi-attention.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1934
keras_test.ipynb Normal file

File diff suppressed because one or more lines are too long

2054
mlp85.ipynb Normal file

File diff suppressed because it is too large Load Diff

68
multi-task-learning.ipynb Normal file
View File

@ -0,0 +1,68 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "30222419-bfdf-4c4b-ada1-5aafbf27bc88",
"metadata": {},
"source": [
"按照图示完成多任务学习的网络结构,部分网络层共享参数"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "573727b3-ebeb-4a34-ad53-7596363052fe",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/zhaojh/miniconda3/envs/py37/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import torch\n",
"from torch import nn"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d08f996-b93e-4e42-a889-b85aa438f4df",
"metadata": {},
"outputs": [],
"source": [
"class MTLNN(nn.Module):\n",
" def __init__(self):\n",
" super(MTLNN, self).__init__()\n",
" self.main_branch = nn.Sequential(\n",
" nn.\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py37",
"language": "python",
"name": "py37"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

166
prophet_city_hours.py Normal file
View File

@ -0,0 +1,166 @@
from cmath import log
import pandas as pd
import os
import numpy as np
from prophet import Prophet
import datetime as dt
from get_holiday_cn.client import getHoliday
from logzero import logger
import pickle
import matplotlib.pyplot as plt
def concat_date(x: str, y: str):
    """Combine a compact calendar date and an hour into one ``datetime``.

    Args:
        x (str): Calendar date written as ``YYYYMMDD``.
        y (str): Hour of the day; minutes and seconds are fixed to zero.

    Returns:
        datetime.datetime: The timestamp at the start of the given hour.

    Raises:
        ValueError: If the assembled text does not match ``%Y%m%d %H:%M:%S``.
    """
    stamp_format = "%Y%m%d %H:%M:%S"
    stamp_text = "{} {}:00:00".format(x, y)
    return dt.datetime.strptime(stamp_text, stamp_format)
def load_data():
    """Read every city CSV under ``./data/城市_*`` and merge the PM2.5 / O3 rows.

    Folders and the files inside them are visited in sorted name order. From
    each CSV only rows whose ``type`` is ``'PM2.5'`` or ``'O3'`` are kept. A
    ``ds`` timestamp column is then built from the ``date`` and ``hour``
    columns via ``concat_date``, and the result is sorted by ``ds`` with a
    fresh 0..n-1 index.

    Returns:
        pd.DataFrame: The merged rows with the added ``ds`` column.
    """
    data_folder = sorted(x for x in os.listdir('./data/') if x.startswith('城市_'))
    # Collect the per-file slices and concatenate once at the end; calling
    # pd.concat inside the loop re-copies all accumulated rows on every file.
    frames = []
    for folder in data_folder:
        for file in sorted(os.listdir(f"./data/{folder}")):
            if file.endswith('csv'):
                data = pd.read_csv(f'./data/{folder}/{file}')
                frames.append(data[data['type'].isin(['PM2.5', 'O3'])].copy())
    # Fall back to an empty frame when nothing matched, as the original
    # accumulator did.
    total_data = pd.concat(frames) if frames else pd.DataFrame()
    total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1)
    total_data.ds = pd.to_datetime(total_data.ds)
    total_data.sort_values(by='ds', ascending=True, inplace=True)
    total_data.reset_index(drop=True, inplace=True)
    logger.info(f"总数据集大小:{total_data.shape}")
    return total_data
def build_model(city: str, data: pd.DataFrame, dtype: str, holiday_mode: pd.DataFrame, split_date="2021-01-01 00:00:00"):
    """Fit a Prophet model for one city and one pollutant, then forecast.

    Rows of ``data`` whose ``type`` equals ``dtype`` are selected, and the
    ``ds`` and ``city`` columns become Prophet's ``ds`` / ``y`` input. Only
    rows with ``ds`` earlier than ``split_date`` are used for fitting. The
    forecast covers the training history plus 365*24 further hourly steps.
    The component plot is written to
    ``./figure/{city}_{dtype}_components.png``.

    Args:
        city (str): Name of the column in ``data`` holding this city's values.
        data (pd.DataFrame): Table with ``type``, ``ds`` and one column per city.
        dtype (str): Value of the ``type`` column to model (e.g. O3 or PM2.5).
        holiday_mode (pd.DataFrame): Passed unchanged as Prophet's ``holidays``
            argument. In this file it is the frame from ``build_holiday``
            (columns ``ds`` and ``holiday``).
        split_date (str, optional): Rows with ``ds`` before this value form the
            training set. Defaults to "2021-01-01 00:00:00".

    Returns:
        tuple: ``(model, forecast)`` - the fitted Prophet model and the frame
        returned by ``model.predict``.

    Raises:
        ValueError: If no rows fall before ``split_date`` (previously this
            surfaced as an ``IndexError`` from the logging line).
    """
    logger.info(f"选择了 {city}{dtype} 数据,")
    use_data = data[(data['type'] == dtype)][["ds", city]].copy()
    use_data.columns = ["ds", "y"]
    train_data = use_data[use_data.ds < split_date].copy()
    if train_data.empty:
        raise ValueError(
            f"no training rows for city={city!r}, type={dtype!r} before {split_date!r}"
        )
    # Log the last training timestamp as a sanity check on the split.
    logger.info(train_data.iloc[-1].ds)
    model = Prophet(
        growth="linear",
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True,
        seasonality_mode="multiplicative",
        seasonality_prior_scale=12,
        holidays=holiday_mode,
        n_changepoints=100,  # number of change points (Prophet's default is 25)
    )
    model.fit(train_data)
    future = model.make_future_dataframe(365 * 24, freq='H', include_history=True)
    forecast = model.predict(future)
    fig = model.plot_components(forecast)
    os.makedirs('./figure', exist_ok=True)
    fig.savefig(f'./figure/{city}_{dtype}_components.png')
    # Close the figure so repeated calls (one per city) do not pile up open
    # matplotlib figures.
    plt.close(fig)
    return model, forecast
def get_date_type(date: str, holiday_client: getHoliday):
    """Classify one calendar day using the holiday client.

    Args:
        date (str): Day to look up, formatted "YYYY-MM-DD".
        holiday_client (getHoliday): Client whose ``assemble_holiday_data`` is
            queried with ``today=date``.

    Returns:
        str | None: ``'oridinary'`` (spelling kept as-is) when the response has
        code 0 and no ``holiday`` entry; the holiday's ``name`` when the entry
        is present; ``None`` when the response code is not 0.
    """
    response = holiday_client.assemble_holiday_data(today=date)
    if response.get('code') != 0:
        # Lookup did not succeed; callers receive None.
        return None
    holiday_info = response.get('holiday')
    if holiday_info is None:
        return 'oridinary'
    return holiday_info.get('name')
def build_holiday(start_date: str = "2015-01-01", end_date: str = "2021-12-31"):
    """List the special (non-ordinary) days between two dates.

    Every day in the inclusive range is classified with ``get_date_type``.
    Days classified as ordinary, and days whose lookup returned nothing, are
    left out.

    Args:
        start_date (str): First day, "YYYY-MM-DD". Defaults to "2015-01-01".
        end_date (str): Last day, "YYYY-MM-DD". Defaults to "2021-12-31".

    Returns:
        pd.DataFrame: Columns ``ds`` ("YYYY-MM-DD" strings) and ``holiday``
        (the name reported by the holiday client), one row per special day.
    """
    ds_list = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='D'), columns=['date'])
    ds_list['date'] = ds_list['date'].dt.strftime('%Y-%m-%d')
    client = getHoliday()
    ds_list['day_type'] = ds_list['date'].apply(lambda x: get_date_type(x, client))
    # get_date_type labels ordinary days 'oridinary' (sic) and returns None on
    # a failed lookup. The old filter compared against 'simple', which never
    # occurs, so every day was kept as a holiday. 'simple' is still excluded
    # in case another producer uses that label.
    is_special = ds_list['day_type'].notna() & ~ds_list['day_type'].isin(['oridinary', 'simple'])
    special_date = ds_list[is_special].copy()
    special_date.columns = ['ds', 'holiday']
    return special_date
def train(data_type, city_list, data):
    """Build, forecast and persist one Prophet model per city.

    Holidays are computed once for the span of ``data.ds`` and shared by all
    cities. After all models are built, each forecast is written to
    ``./result/{data_type}/data/{city}.csv`` and each model is pickled to
    ``./result/{data_type}/model/{city}.pkl``.

    Args:
        data_type: Value of the ``type`` column to model; also used as the
            result sub-directory name.
        city_list: Iterable of city column names to model.
        data: Table passed to ``build_model``; must expose a ``ds`` column.

    Returns:
        tuple: ``(model_dict, predict_dict)``, both keyed by city.
    """
    model_dict = dict()
    predict_dict = dict()
    holiday_data = build_holiday(data.ds.min(), data.ds.max())
    for city in city_list:
        model, pred = build_model(city, data, data_type, holiday_data, '2021-01-01')
        model_dict[city] = model
        predict_dict[city] = pred
        logger.info(f"{city} 模型构建完成")
    # makedirs creates the missing parents in one call and tolerates existing
    # directories, replacing the chain of exists()/mkdir() checks.
    os.makedirs(f'./result/{data_type}/model', exist_ok=True)
    os.makedirs(f'./result/{data_type}/data', exist_ok=True)
    for city, city_pred in predict_dict.items():
        city_pred.to_csv(f'./result/{data_type}/data/{city}.csv', encoding='utf-8', index=False)
        logger.info(f"{city} 预测数据保存完成")
    for city, city_model in model_dict.items():
        with open(f'./result/{data_type}/model/{city}.pkl', 'wb') as fwb:
            pickle.dump(city_model, fwb)
        logger.info(f"{city} 模型保存完成")
    return model_dict, predict_dict
if __name__ == '__main__':
    data_type = 'O3'  # change this to switch the pollutant type
    city_list = ['北京']  # add cities here
    if os.path.exists('./data/total_data.csv'):
        # Parse ``ds`` on load so the cached path hands train() the same
        # datetime column that load_data() produces on a fresh run.
        data = pd.read_csv('./data/total_data.csv', parse_dates=['ds'])
    else:
        data = load_data()
        data.to_csv('./data/total_data.csv', encoding='utf-8', index=False)
    model_dict, pred_list = train(data_type, city_list, data)
    # To test against a stored model, load it from disk instead. Only
    # unpickle files you produced yourself; pickle.load can run arbitrary code.
    #     with open('./result/O3/model/北京.pkl', 'rb') as fr:
    #         local_model = pickle.load(fr)

33
prophet_test.py Normal file
View File

@ -0,0 +1,33 @@
import pandas as pd
import os
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = "E:/Predition/data/environment_data/城市/处理表头数据/daily_results/normal_daily_results"

# The input is expected to hold a ``date`` column plus exactly one value
# column; they are renamed to Prophet's ``ds`` / ``y`` below.
df = pd.read_csv(f'{path}/test.csv', index_col='date', parse_dates=['date'])
print(df.shape)
df.reset_index(inplace=True)
df.columns = ["ds", "y"]
output_file = os.path.join(path, 'test-output.csv')
df.to_csv(output_file)

# Positional 70/30 split: first 70% of rows for fitting, the rest for scoring.
split_at = int(df.shape[0] * 0.7)
train = df[:split_at]
test = df[split_at:]

model = Prophet(growth="linear",
                yearly_seasonality=True,
                weekly_seasonality=False,
                daily_seasonality=False,
                seasonality_mode="multiplicative",
                seasonality_prior_scale=12,
                )
model.fit(train)
forecast = model.predict(test)
print(forecast)
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())

# Score only timestamps that have an observed value. The previous version
# filled gaps with the string '0' and then referenced an undefined ``new_df``,
# so the final line always raised NameError. Joining on ``ds`` keeps truth and
# prediction aligned regardless of row order.
scored = (
    test[['ds', 'y']]
    .merge(forecast[['ds', 'yhat']], on='ds', how='inner')
    .dropna(subset=['y'])
)
print("在测试集上绝对值预测误差为:", mean_absolute_error(scored.y.values, scored.yhat.values))

1435
test.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

918
时间插值.ipynb Normal file
View File

@ -0,0 +1,918 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(52583, 54)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv('./data/ori_data.csv')\n",
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>PM2.5</th>\n",
" <th>PM10</th>\n",
" <th>SO2</th>\n",
" <th>NO2</th>\n",
" <th>O3</th>\n",
" <th>CO</th>\n",
" <th>Ox</th>\n",
" <th>wind-U</th>\n",
" <th>wind-V</th>\n",
" <th>...</th>\n",
" <th>VOC_power</th>\n",
" <th>VOC_agricultural</th>\n",
" <th>PM2.5_industrial</th>\n",
" <th>PM2.5_transportation</th>\n",
" <th>PM2.5_resdient</th>\n",
" <th>PM2.5_power</th>\n",
" <th>PM2.5_agricultural</th>\n",
" <th>CO_Bio</th>\n",
" <th>VOCs_Bio</th>\n",
" <th>pre_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-01-02 01:00:00</td>\n",
" <td>136.0</td>\n",
" <td>214.0</td>\n",
" <td>317.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.71</td>\n",
" <td>46.0</td>\n",
" <td>0.831775</td>\n",
" <td>-0.555113</td>\n",
" <td>...</td>\n",
" <td>0.037724</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.077715</td>\n",
" <td>0.827110</td>\n",
" <td>0.436028</td>\n",
" <td>0.0</td>\n",
" <td>0.081546</td>\n",
" <td>4.217706</td>\n",
" <td>2015-01-02 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-01-02 02:00:00</td>\n",
" <td>114.0</td>\n",
" <td>176.0</td>\n",
" <td>305.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.55</td>\n",
" <td>46.0</td>\n",
" <td>-0.695011</td>\n",
" <td>-0.083426</td>\n",
" <td>...</td>\n",
" <td>0.036215</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.081248</td>\n",
" <td>0.827110</td>\n",
" <td>0.418587</td>\n",
" <td>0.0</td>\n",
" <td>0.080031</td>\n",
" <td>4.119807</td>\n",
" <td>2015-01-02 01:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-01-02 03:00:00</td>\n",
" <td>97.0</td>\n",
" <td>154.0</td>\n",
" <td>306.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.51</td>\n",
" <td>44.0</td>\n",
" <td>-0.173311</td>\n",
" <td>0.469003</td>\n",
" <td>...</td>\n",
" <td>0.035712</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.088313</td>\n",
" <td>0.827110</td>\n",
" <td>0.412773</td>\n",
" <td>0.0</td>\n",
" <td>0.077761</td>\n",
" <td>3.973464</td>\n",
" <td>2015-01-02 02:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2015-01-02 04:00:00</td>\n",
" <td>87.0</td>\n",
" <td>141.0</td>\n",
" <td>316.0</td>\n",
" <td>38.0</td>\n",
" <td>7.0</td>\n",
" <td>3.55</td>\n",
" <td>45.0</td>\n",
" <td>0.000000</td>\n",
" <td>-0.200000</td>\n",
" <td>...</td>\n",
" <td>0.036718</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.091256</td>\n",
" <td>0.827110</td>\n",
" <td>0.424400</td>\n",
" <td>0.0</td>\n",
" <td>0.076766</td>\n",
" <td>3.909235</td>\n",
" <td>2015-01-02 03:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2015-01-02 05:00:00</td>\n",
" <td>85.0</td>\n",
" <td>139.0</td>\n",
" <td>292.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.62</td>\n",
" <td>44.0</td>\n",
" <td>1.234518</td>\n",
" <td>0.660276</td>\n",
" <td>...</td>\n",
" <td>0.039736</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.092434</td>\n",
" <td>1.746121</td>\n",
" <td>0.459282</td>\n",
" <td>0.0</td>\n",
" <td>0.077119</td>\n",
" <td>3.930702</td>\n",
" <td>2015-01-02 04:00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 54 columns</p>\n",
"</div>"
],
"text/plain": [
" date PM2.5 PM10 SO2 NO2 O3 CO Ox wind-U \\\n",
"0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 0.831775 \n",
"1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 -0.695011 \n",
"2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 -0.173311 \n",
"3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 0.000000 \n",
"4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 1.234518 \n",
"\n",
" wind-V ... VOC_power VOC_agricultural PM2.5_industrial \\\n",
"0 -0.555113 ... 0.037724 0.0 0.926851 \n",
"1 -0.083426 ... 0.036215 0.0 0.926851 \n",
"2 0.469003 ... 0.035712 0.0 0.926851 \n",
"3 -0.200000 ... 0.036718 0.0 0.926851 \n",
"4 0.660276 ... 0.039736 0.0 0.926851 \n",
"\n",
" PM2.5_transportation PM2.5_resdient PM2.5_power PM2.5_agricultural \\\n",
"0 0.077715 0.827110 0.436028 0.0 \n",
"1 0.081248 0.827110 0.418587 0.0 \n",
"2 0.088313 0.827110 0.412773 0.0 \n",
"3 0.091256 0.827110 0.424400 0.0 \n",
"4 0.092434 1.746121 0.459282 0.0 \n",
"\n",
" CO_Bio VOCs_Bio pre_time \n",
"0 0.081546 4.217706 2015-01-02 00:00:00 \n",
"1 0.080031 4.119807 2015-01-02 01:00:00 \n",
"2 0.077761 3.973464 2015-01-02 02:00:00 \n",
"3 0.076766 3.909235 2015-01-02 03:00:00 \n",
"4 0.077119 3.930702 2015-01-02 04:00:00 \n",
"\n",
"[5 rows x 54 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"data.date = pd.to_datetime(data.date)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import datetime as dt"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_lookdays(x):\n",
" start = x - dt.timedelta(hours=24)\n",
" end = x - dt.timedelta(hours=1)\n",
" period = pd.date_range(start, end, freq='H')\n",
" return [dt.datetime.strftime(x, '%Y-%m-%d %H:%M:%S') for x in period.tolist()]\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"date_range = pd.date_range(data.date.min(), data.date.max(), freq='H')\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'CO']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out_cols = data.columns[1:7].tolist()\n",
"out_cols"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data['day'] = data.date.apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d'))\n",
"na_counts = data.set_index('day')[out_cols].isna().groupby('day').sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"drop_days = na_counts[na_counts.SO2>5].index.values"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"data = data.set_index('date').interpolate(method='linear')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"data = data.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# drop_days holds 'YYYY-MM-DD' day labels while data.date is hourly, so compare\n",
"# on the day part; matching the raw timestamps could hit at most the 00:00 row.\n",
"data = data[~data.date.dt.strftime('%Y-%m-%d').isin(drop_days)].copy()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"data = data.set_index('date').reindex(date_range)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"data.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"data.rename(columns={'index':'date'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"data['lookdays'] = data.date.apply(get_lookdays)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"data['features'] = data.lookdays.apply(lambda x: data[data.date.isin(x)][out_cols].values.reshape(-1,).tolist())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"data['feature_len'] = data.features.apply(lambda x: len(x))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"save_data = data[data.feature_len >=144].copy()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"pre_cols = list()\n",
"for i in range(24, 0, -1):\n",
" for j in out_cols:\n",
" pre_cols.append(f\"{i}_{j}\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Expand the per-row feature lists into one column per entry of pre_cols.\n",
"# A single DataFrame constructor call replaces apply(pd.Series, ...), which\n",
"# builds one Series object per row and is very slow on large frames.\n",
"previous_out = pd.DataFrame(\n",
"    save_data['features'].tolist(),\n",
"    index=save_data.index,\n",
"    columns=pre_cols,\n",
")\n",
"# Carry the timestamp along so the lag columns can be merged back on 'date'.\n",
"previous_out['date'] = save_data['date'].values"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>PM2.5</th>\n",
" <th>PM10</th>\n",
" <th>SO2</th>\n",
" <th>NO2</th>\n",
" <th>O3</th>\n",
" <th>CO</th>\n",
" <th>Ox</th>\n",
" <th>wind-U</th>\n",
" <th>wind-V</th>\n",
" <th>...</th>\n",
" <th>2_SO2</th>\n",
" <th>2_NO2</th>\n",
" <th>2_O3</th>\n",
" <th>2_CO</th>\n",
" <th>1_PM2.5</th>\n",
" <th>1_PM10</th>\n",
" <th>1_SO2</th>\n",
" <th>1_NO2</th>\n",
" <th>1_O3</th>\n",
" <th>1_CO</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-01-02 01:00:00</td>\n",
" <td>136.0</td>\n",
" <td>214.0</td>\n",
" <td>317.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.71</td>\n",
" <td>46.0</td>\n",
" <td>0.831775</td>\n",
" <td>-0.555113</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-01-02 02:00:00</td>\n",
" <td>114.0</td>\n",
" <td>176.0</td>\n",
" <td>305.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.55</td>\n",
" <td>46.0</td>\n",
" <td>-0.695011</td>\n",
" <td>-0.083426</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-01-02 03:00:00</td>\n",
" <td>97.0</td>\n",
" <td>154.0</td>\n",
" <td>306.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.51</td>\n",
" <td>44.0</td>\n",
" <td>-0.173311</td>\n",
" <td>0.469003</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2015-01-02 04:00:00</td>\n",
" <td>87.0</td>\n",
" <td>141.0</td>\n",
" <td>316.0</td>\n",
" <td>38.0</td>\n",
" <td>7.0</td>\n",
" <td>3.55</td>\n",
" <td>45.0</td>\n",
" <td>0.000000</td>\n",
" <td>-0.200000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2015-01-02 05:00:00</td>\n",
" <td>85.0</td>\n",
" <td>139.0</td>\n",
" <td>292.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.62</td>\n",
" <td>44.0</td>\n",
" <td>1.234518</td>\n",
" <td>0.660276</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52578</th>\n",
" <td>2020-12-31 19:00:00</td>\n",
" <td>27.0</td>\n",
" <td>51.0</td>\n",
" <td>16.0</td>\n",
" <td>46.0</td>\n",
" <td>29.0</td>\n",
" <td>0.72</td>\n",
" <td>75.0</td>\n",
" <td>1.067581</td>\n",
" <td>-0.265087</td>\n",
" <td>...</td>\n",
" <td>25.0</td>\n",
" <td>34.0</td>\n",
" <td>43.0</td>\n",
" <td>0.75</td>\n",
" <td>31.0</td>\n",
" <td>59.0</td>\n",
" <td>21.0</td>\n",
" <td>47.0</td>\n",
" <td>29.0</td>\n",
" <td>0.91</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52579</th>\n",
" <td>2020-12-31 20:00:00</td>\n",
" <td>26.0</td>\n",
" <td>51.0</td>\n",
" <td>12.0</td>\n",
" <td>47.0</td>\n",
" <td>26.0</td>\n",
" <td>0.83</td>\n",
" <td>73.0</td>\n",
" <td>0.029164</td>\n",
" <td>0.298579</td>\n",
" <td>...</td>\n",
" <td>21.0</td>\n",
" <td>47.0</td>\n",
" <td>29.0</td>\n",
" <td>0.91</td>\n",
" <td>27.0</td>\n",
" <td>51.0</td>\n",
" <td>16.0</td>\n",
" <td>46.0</td>\n",
" <td>29.0</td>\n",
" <td>0.72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52580</th>\n",
" <td>2020-12-31 21:00:00</td>\n",
" <td>29.0</td>\n",
" <td>58.0</td>\n",
" <td>16.0</td>\n",
" <td>48.0</td>\n",
" <td>25.0</td>\n",
" <td>1.15</td>\n",
" <td>73.0</td>\n",
" <td>-0.079532</td>\n",
" <td>0.896479</td>\n",
" <td>...</td>\n",
" <td>16.0</td>\n",
" <td>46.0</td>\n",
" <td>29.0</td>\n",
" <td>0.72</td>\n",
" <td>26.0</td>\n",
" <td>51.0</td>\n",
" <td>12.0</td>\n",
" <td>47.0</td>\n",
" <td>26.0</td>\n",
" <td>0.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52581</th>\n",
" <td>2020-12-31 22:00:00</td>\n",
" <td>32.0</td>\n",
" <td>60.0</td>\n",
" <td>23.0</td>\n",
" <td>49.0</td>\n",
" <td>20.0</td>\n",
" <td>0.90</td>\n",
" <td>69.0</td>\n",
" <td>-1.660193</td>\n",
" <td>0.365729</td>\n",
" <td>...</td>\n",
" <td>12.0</td>\n",
" <td>47.0</td>\n",
" <td>26.0</td>\n",
" <td>0.83</td>\n",
" <td>29.0</td>\n",
" <td>58.0</td>\n",
" <td>16.0</td>\n",
" <td>48.0</td>\n",
" <td>25.0</td>\n",
" <td>1.15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52582</th>\n",
" <td>2020-12-31 23:00:00</td>\n",
" <td>53.0</td>\n",
" <td>94.0</td>\n",
" <td>41.0</td>\n",
" <td>57.0</td>\n",
" <td>12.0</td>\n",
" <td>1.18</td>\n",
" <td>69.0</td>\n",
" <td>-0.106042</td>\n",
" <td>1.195305</td>\n",
" <td>...</td>\n",
" <td>16.0</td>\n",
" <td>48.0</td>\n",
" <td>25.0</td>\n",
" <td>1.15</td>\n",
" <td>32.0</td>\n",
" <td>60.0</td>\n",
" <td>23.0</td>\n",
" <td>49.0</td>\n",
" <td>20.0</td>\n",
" <td>0.90</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>52583 rows × 200 columns</p>\n",
"</div>"
],
"text/plain": [
" date PM2.5 PM10 SO2 NO2 O3 CO Ox \\\n",
"0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 \n",
"1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 \n",
"2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 \n",
"3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 \n",
"4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 \n",
"... ... ... ... ... ... ... ... ... \n",
"52578 2020-12-31 19:00:00 27.0 51.0 16.0 46.0 29.0 0.72 75.0 \n",
"52579 2020-12-31 20:00:00 26.0 51.0 12.0 47.0 26.0 0.83 73.0 \n",
"52580 2020-12-31 21:00:00 29.0 58.0 16.0 48.0 25.0 1.15 73.0 \n",
"52581 2020-12-31 22:00:00 32.0 60.0 23.0 49.0 20.0 0.90 69.0 \n",
"52582 2020-12-31 23:00:00 53.0 94.0 41.0 57.0 12.0 1.18 69.0 \n",
"\n",
" wind-U wind-V ... 2_SO2 2_NO2 2_O3 2_CO 1_PM2.5 1_PM10 \\\n",
"0 0.831775 -0.555113 ... NaN NaN NaN NaN NaN NaN \n",
"1 -0.695011 -0.083426 ... NaN NaN NaN NaN NaN NaN \n",
"2 -0.173311 0.469003 ... NaN NaN NaN NaN NaN NaN \n",
"3 0.000000 -0.200000 ... NaN NaN NaN NaN NaN NaN \n",
"4 1.234518 0.660276 ... NaN NaN NaN NaN NaN NaN \n",
"... ... ... ... ... ... ... ... ... ... \n",
"52578 1.067581 -0.265087 ... 25.0 34.0 43.0 0.75 31.0 59.0 \n",
"52579 0.029164 0.298579 ... 21.0 47.0 29.0 0.91 27.0 51.0 \n",
"52580 -0.079532 0.896479 ... 16.0 46.0 29.0 0.72 26.0 51.0 \n",
"52581 -1.660193 0.365729 ... 12.0 47.0 26.0 0.83 29.0 58.0 \n",
"52582 -0.106042 1.195305 ... 16.0 48.0 25.0 1.15 32.0 60.0 \n",
"\n",
" 1_SO2 1_NO2 1_O3 1_CO \n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"... ... ... ... ... \n",
"52578 21.0 47.0 29.0 0.91 \n",
"52579 16.0 46.0 29.0 0.72 \n",
"52580 12.0 47.0 26.0 0.83 \n",
"52581 16.0 48.0 25.0 1.15 \n",
"52582 23.0 49.0 20.0 0.90 \n",
"\n",
"[52583 rows x 200 columns]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview only (the result is not assigned): lag columns left-joined onto data.\n",
"(\n",
"    data\n",
"    .drop(columns=['features', 'feature_len'])\n",
"    .merge(previous_out, how='left', on='date')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Inner-join the lag columns with the remaining columns of data.\n",
"# The key is spelled out as 'date' (as in the preview merge) instead of relying\n",
"# on merge's implicit use of every column name the two frames have in common.\n",
"new_data = previous_out.merge(\n",
"    data.drop(columns=['features', 'feature_len', 'lookdays', 'pre_time']),\n",
"    on='date',\n",
"    how='inner',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Columns to discard: every 'agricultural' column except NH3_agricultural,\n",
"# plus NH3_power, CO_Bio and VOCs_Bio.\n",
"drop_cols = [name for name in new_data.columns if 'agricultural' in name]\n",
"drop_cols.remove('NH3_agricultural')  # raises ValueError if that column is absent\n",
"drop_cols.extend(['NH3_power', 'CO_Bio', 'VOCs_Bio'])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"# Remove the columns listed in drop_cols (rebinding instead of inplace=True).\n",
"new_data = new_data.drop(columns=drop_cols)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Discard every row that still contains a missing value in any column.\n",
"new_data = new_data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# The helper column 'day' is not part of the exported data.\n",
"new_data = new_data.drop(columns='day')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(49014, 188)"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display (rows, columns) of the frame that is about to be exported.\n",
"new_data.shape"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# Write the frame to CSV with 'date' as the index column; the 'utf-8-sig'\n",
"# encoding prefixes the file with a byte-order mark.\n",
"(\n",
"    new_data\n",
"    .set_index('date')\n",
"    .to_csv('new_train_data.csv', encoding='utf-8-sig')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py37",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}