22-T67/时间插值.ipynb

919 lines
27 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(52583, 54)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the raw table from CSV; the recorded run reports 52583 rows x 54 columns.\n",
"data = pd.read_csv('./data/ori_data.csv')\n",
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>PM2.5</th>\n",
" <th>PM10</th>\n",
" <th>SO2</th>\n",
" <th>NO2</th>\n",
" <th>O3</th>\n",
" <th>CO</th>\n",
" <th>Ox</th>\n",
" <th>wind-U</th>\n",
" <th>wind-V</th>\n",
" <th>...</th>\n",
" <th>VOC_power</th>\n",
" <th>VOC_agricultural</th>\n",
" <th>PM2.5_industrial</th>\n",
" <th>PM2.5_transportation</th>\n",
" <th>PM2.5_resdient</th>\n",
" <th>PM2.5_power</th>\n",
" <th>PM2.5_agricultural</th>\n",
" <th>CO_Bio</th>\n",
" <th>VOCs_Bio</th>\n",
" <th>pre_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-01-02 01:00:00</td>\n",
" <td>136.0</td>\n",
" <td>214.0</td>\n",
" <td>317.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.71</td>\n",
" <td>46.0</td>\n",
" <td>0.831775</td>\n",
" <td>-0.555113</td>\n",
" <td>...</td>\n",
" <td>0.037724</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.077715</td>\n",
" <td>0.827110</td>\n",
" <td>0.436028</td>\n",
" <td>0.0</td>\n",
" <td>0.081546</td>\n",
" <td>4.217706</td>\n",
" <td>2015-01-02 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-01-02 02:00:00</td>\n",
" <td>114.0</td>\n",
" <td>176.0</td>\n",
" <td>305.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.55</td>\n",
" <td>46.0</td>\n",
" <td>-0.695011</td>\n",
" <td>-0.083426</td>\n",
" <td>...</td>\n",
" <td>0.036215</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.081248</td>\n",
" <td>0.827110</td>\n",
" <td>0.418587</td>\n",
" <td>0.0</td>\n",
" <td>0.080031</td>\n",
" <td>4.119807</td>\n",
" <td>2015-01-02 01:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-01-02 03:00:00</td>\n",
" <td>97.0</td>\n",
" <td>154.0</td>\n",
" <td>306.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.51</td>\n",
" <td>44.0</td>\n",
" <td>-0.173311</td>\n",
" <td>0.469003</td>\n",
" <td>...</td>\n",
" <td>0.035712</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.088313</td>\n",
" <td>0.827110</td>\n",
" <td>0.412773</td>\n",
" <td>0.0</td>\n",
" <td>0.077761</td>\n",
" <td>3.973464</td>\n",
" <td>2015-01-02 02:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2015-01-02 04:00:00</td>\n",
" <td>87.0</td>\n",
" <td>141.0</td>\n",
" <td>316.0</td>\n",
" <td>38.0</td>\n",
" <td>7.0</td>\n",
" <td>3.55</td>\n",
" <td>45.0</td>\n",
" <td>0.000000</td>\n",
" <td>-0.200000</td>\n",
" <td>...</td>\n",
" <td>0.036718</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.091256</td>\n",
" <td>0.827110</td>\n",
" <td>0.424400</td>\n",
" <td>0.0</td>\n",
" <td>0.076766</td>\n",
" <td>3.909235</td>\n",
" <td>2015-01-02 03:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2015-01-02 05:00:00</td>\n",
" <td>85.0</td>\n",
" <td>139.0</td>\n",
" <td>292.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.62</td>\n",
" <td>44.0</td>\n",
" <td>1.234518</td>\n",
" <td>0.660276</td>\n",
" <td>...</td>\n",
" <td>0.039736</td>\n",
" <td>0.0</td>\n",
" <td>0.926851</td>\n",
" <td>0.092434</td>\n",
" <td>1.746121</td>\n",
" <td>0.459282</td>\n",
" <td>0.0</td>\n",
" <td>0.077119</td>\n",
" <td>3.930702</td>\n",
" <td>2015-01-02 04:00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 54 columns</p>\n",
"</div>"
],
"text/plain": [
" date PM2.5 PM10 SO2 NO2 O3 CO Ox wind-U \\\n",
"0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 0.831775 \n",
"1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 -0.695011 \n",
"2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 -0.173311 \n",
"3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 0.000000 \n",
"4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 1.234518 \n",
"\n",
" wind-V ... VOC_power VOC_agricultural PM2.5_industrial \\\n",
"0 -0.555113 ... 0.037724 0.0 0.926851 \n",
"1 -0.083426 ... 0.036215 0.0 0.926851 \n",
"2 0.469003 ... 0.035712 0.0 0.926851 \n",
"3 -0.200000 ... 0.036718 0.0 0.926851 \n",
"4 0.660276 ... 0.039736 0.0 0.926851 \n",
"\n",
" PM2.5_transportation PM2.5_resdient PM2.5_power PM2.5_agricultural \\\n",
"0 0.077715 0.827110 0.436028 0.0 \n",
"1 0.081248 0.827110 0.418587 0.0 \n",
"2 0.088313 0.827110 0.412773 0.0 \n",
"3 0.091256 0.827110 0.424400 0.0 \n",
"4 0.092434 1.746121 0.459282 0.0 \n",
"\n",
" CO_Bio VOCs_Bio pre_time \n",
"0 0.081546 4.217706 2015-01-02 00:00:00 \n",
"1 0.080031 4.119807 2015-01-02 01:00:00 \n",
"2 0.077761 3.973464 2015-01-02 02:00:00 \n",
"3 0.076766 3.909235 2015-01-02 03:00:00 \n",
"4 0.077119 3.930702 2015-01-02 04:00:00 \n",
"\n",
"[5 rows x 54 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows to check the column layout.\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Convert the 'date' column to datetime64 so it can be used for date arithmetic and as a time index.\n",
"data.date = pd.to_datetime(data.date)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import datetime as dt"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_lookdays(x):\n",
"    \"\"\"Return the 24 hourly timestamps that precede ``x``, as strings.\n",
"\n",
"    The window runs from ``x - 24h`` to ``x - 1h`` inclusive, oldest first,\n",
"    and each entry is formatted as ``'%Y-%m-%d %H:%M:%S'``.\n",
"    \"\"\"\n",
"    window = pd.date_range(\n",
"        x - dt.timedelta(hours=24),\n",
"        x - dt.timedelta(hours=1),\n",
"        freq='H',\n",
"    )\n",
"    return window.strftime('%Y-%m-%d %H:%M:%S').tolist()\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Every hour between the first and last timestamp in the data.\n",
"date_range = pd.date_range(data.date.min(), data.date.max(), freq='H')\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'CO']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Columns 1-6 of the table; in the recorded run these are PM2.5, PM10, SO2, NO2, O3 and CO.\n",
"out_cols = data.columns[1:7].tolist()\n",
"out_cols"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Calendar-day key ('YYYY-MM-DD') for each row, built with the vectorised\n",
"# .dt accessor rather than a per-row Python lambda.\n",
"data['day'] = data.date.dt.strftime('%Y-%m-%d')\n",
"# Number of missing readings per day for each column in out_cols.\n",
"na_counts = data.set_index('day')[out_cols].isna().groupby('day').sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Days with more than 5 missing SO2 readings, as 'YYYY-MM-DD' strings (the index of na_counts).\n",
"# Only the SO2 column decides whether a day is flagged.\n",
"drop_days = na_counts[na_counts.SO2>5].index.values"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Fill gaps by linear interpolation. method='linear' treats consecutive rows as equally spaced\n",
"# and ignores the datetime index values.\n",
"# NOTE(review): if hours can be missing from the CSV, method='time' may be what is wanted - confirm.\n",
"data = data.set_index('date').interpolate(method='linear')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Move 'date' back from the index to a regular column.\n",
"data = data.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Remove every row that belongs to a flagged day. drop_days holds 'YYYY-MM-DD'\n",
"# strings, so they must be matched against the string 'day' column; matching\n",
"# them against the datetime 'date' column only hits the 00:00:00 rows at best.\n",
"data = data[~data.day.isin(drop_days)].copy()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Align the rows to the full hourly grid; hours that are absent (including rows removed above)\n",
"# come back as all-NaN rows.\n",
"data = data.set_index('date').reindex(date_range)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Turn the datetime index back into a column; the index is unnamed, so the column is called 'index'.\n",
"data.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Restore the column name 'date' for the column created by reset_index.\n",
"data.rename(columns={'index':'date'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# For every row, the list of the 24 preceding hourly timestamps (as strings) from get_lookdays.\n",
"data['lookdays'] = data.date.apply(get_lookdays)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Flatten, for every row, the out_cols values of the (up to) 24 preceding hours,\n",
"# oldest hour first. After the reindex onto the full hourly grid the rows are\n",
"# consecutive hours, so the rows matching `lookdays` are simply the 24 rows\n",
"# directly above; slicing them avoids one full-table isin() scan per row.\n",
"assert data.date.diff().dropna().eq(pd.Timedelta(hours=1)).all(), 'rows must be consecutive hours'\n",
"window_values = data[out_cols].values\n",
"data['features'] = pd.Series(\n",
"    [window_values[max(row - 24, 0):row].reshape(-1).tolist() for row in range(len(data))],\n",
"    index=data.index,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Length of each flattened window; a complete window has 24 hours x 6 columns = 144 values.\n",
"data['feature_len'] = data['features'].apply(len)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Keep only rows whose look-back window is complete (144 values).\n",
"# NOTE(review): this checks the count only; NaN values inside a window still pass.\n",
"full_window = data['feature_len'] >= 144\n",
"save_data = data.loc[full_window].copy()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Column names for the lagged features, ordered from the oldest lag (24) to the newest (1),\n",
"# with the out_cols order repeated inside each lag, e.g. '24_PM2.5', ..., '1_CO'.\n",
"pre_cols = [f\"{lag}_{col}\" for lag in range(24, 0, -1) for col in out_cols]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Expand each 144-value window into one column per (lag, column) pair.\n",
"# Building the frame from the list of lists in one call replaces the original\n",
"# row-wise apply(pd.Series), which constructs one Series object per row.\n",
"previous_out = pd.DataFrame(\n",
"    save_data.features.tolist(), index=save_data.index, columns=pre_cols\n",
")\n",
"# Carry the timestamp along so the lagged features can be merged back by 'date'.\n",
"previous_out['date'] = save_data.date.values"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>PM2.5</th>\n",
" <th>PM10</th>\n",
" <th>SO2</th>\n",
" <th>NO2</th>\n",
" <th>O3</th>\n",
" <th>CO</th>\n",
" <th>Ox</th>\n",
" <th>wind-U</th>\n",
" <th>wind-V</th>\n",
" <th>...</th>\n",
" <th>2_SO2</th>\n",
" <th>2_NO2</th>\n",
" <th>2_O3</th>\n",
" <th>2_CO</th>\n",
" <th>1_PM2.5</th>\n",
" <th>1_PM10</th>\n",
" <th>1_SO2</th>\n",
" <th>1_NO2</th>\n",
" <th>1_O3</th>\n",
" <th>1_CO</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-01-02 01:00:00</td>\n",
" <td>136.0</td>\n",
" <td>214.0</td>\n",
" <td>317.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.71</td>\n",
" <td>46.0</td>\n",
" <td>0.831775</td>\n",
" <td>-0.555113</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015-01-02 02:00:00</td>\n",
" <td>114.0</td>\n",
" <td>176.0</td>\n",
" <td>305.0</td>\n",
" <td>38.0</td>\n",
" <td>8.0</td>\n",
" <td>3.55</td>\n",
" <td>46.0</td>\n",
" <td>-0.695011</td>\n",
" <td>-0.083426</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2015-01-02 03:00:00</td>\n",
" <td>97.0</td>\n",
" <td>154.0</td>\n",
" <td>306.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.51</td>\n",
" <td>44.0</td>\n",
" <td>-0.173311</td>\n",
" <td>0.469003</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2015-01-02 04:00:00</td>\n",
" <td>87.0</td>\n",
" <td>141.0</td>\n",
" <td>316.0</td>\n",
" <td>38.0</td>\n",
" <td>7.0</td>\n",
" <td>3.55</td>\n",
" <td>45.0</td>\n",
" <td>0.000000</td>\n",
" <td>-0.200000</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2015-01-02 05:00:00</td>\n",
" <td>85.0</td>\n",
" <td>139.0</td>\n",
" <td>292.0</td>\n",
" <td>37.0</td>\n",
" <td>7.0</td>\n",
" <td>3.62</td>\n",
" <td>44.0</td>\n",
" <td>1.234518</td>\n",
" <td>0.660276</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52578</th>\n",
" <td>2020-12-31 19:00:00</td>\n",
" <td>27.0</td>\n",
" <td>51.0</td>\n",
" <td>16.0</td>\n",
" <td>46.0</td>\n",
" <td>29.0</td>\n",
" <td>0.72</td>\n",
" <td>75.0</td>\n",
" <td>1.067581</td>\n",
" <td>-0.265087</td>\n",
" <td>...</td>\n",
" <td>25.0</td>\n",
" <td>34.0</td>\n",
" <td>43.0</td>\n",
" <td>0.75</td>\n",
" <td>31.0</td>\n",
" <td>59.0</td>\n",
" <td>21.0</td>\n",
" <td>47.0</td>\n",
" <td>29.0</td>\n",
" <td>0.91</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52579</th>\n",
" <td>2020-12-31 20:00:00</td>\n",
" <td>26.0</td>\n",
" <td>51.0</td>\n",
" <td>12.0</td>\n",
" <td>47.0</td>\n",
" <td>26.0</td>\n",
" <td>0.83</td>\n",
" <td>73.0</td>\n",
" <td>0.029164</td>\n",
" <td>0.298579</td>\n",
" <td>...</td>\n",
" <td>21.0</td>\n",
" <td>47.0</td>\n",
" <td>29.0</td>\n",
" <td>0.91</td>\n",
" <td>27.0</td>\n",
" <td>51.0</td>\n",
" <td>16.0</td>\n",
" <td>46.0</td>\n",
" <td>29.0</td>\n",
" <td>0.72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52580</th>\n",
" <td>2020-12-31 21:00:00</td>\n",
" <td>29.0</td>\n",
" <td>58.0</td>\n",
" <td>16.0</td>\n",
" <td>48.0</td>\n",
" <td>25.0</td>\n",
" <td>1.15</td>\n",
" <td>73.0</td>\n",
" <td>-0.079532</td>\n",
" <td>0.896479</td>\n",
" <td>...</td>\n",
" <td>16.0</td>\n",
" <td>46.0</td>\n",
" <td>29.0</td>\n",
" <td>0.72</td>\n",
" <td>26.0</td>\n",
" <td>51.0</td>\n",
" <td>12.0</td>\n",
" <td>47.0</td>\n",
" <td>26.0</td>\n",
" <td>0.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52581</th>\n",
" <td>2020-12-31 22:00:00</td>\n",
" <td>32.0</td>\n",
" <td>60.0</td>\n",
" <td>23.0</td>\n",
" <td>49.0</td>\n",
" <td>20.0</td>\n",
" <td>0.90</td>\n",
" <td>69.0</td>\n",
" <td>-1.660193</td>\n",
" <td>0.365729</td>\n",
" <td>...</td>\n",
" <td>12.0</td>\n",
" <td>47.0</td>\n",
" <td>26.0</td>\n",
" <td>0.83</td>\n",
" <td>29.0</td>\n",
" <td>58.0</td>\n",
" <td>16.0</td>\n",
" <td>48.0</td>\n",
" <td>25.0</td>\n",
" <td>1.15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52582</th>\n",
" <td>2020-12-31 23:00:00</td>\n",
" <td>53.0</td>\n",
" <td>94.0</td>\n",
" <td>41.0</td>\n",
" <td>57.0</td>\n",
" <td>12.0</td>\n",
" <td>1.18</td>\n",
" <td>69.0</td>\n",
" <td>-0.106042</td>\n",
" <td>1.195305</td>\n",
" <td>...</td>\n",
" <td>16.0</td>\n",
" <td>48.0</td>\n",
" <td>25.0</td>\n",
" <td>1.15</td>\n",
" <td>32.0</td>\n",
" <td>60.0</td>\n",
" <td>23.0</td>\n",
" <td>49.0</td>\n",
" <td>20.0</td>\n",
" <td>0.90</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>52583 rows × 200 columns</p>\n",
"</div>"
],
"text/plain": [
" date PM2.5 PM10 SO2 NO2 O3 CO Ox \\\n",
"0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 \n",
"1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 \n",
"2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 \n",
"3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 \n",
"4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 \n",
"... ... ... ... ... ... ... ... ... \n",
"52578 2020-12-31 19:00:00 27.0 51.0 16.0 46.0 29.0 0.72 75.0 \n",
"52579 2020-12-31 20:00:00 26.0 51.0 12.0 47.0 26.0 0.83 73.0 \n",
"52580 2020-12-31 21:00:00 29.0 58.0 16.0 48.0 25.0 1.15 73.0 \n",
"52581 2020-12-31 22:00:00 32.0 60.0 23.0 49.0 20.0 0.90 69.0 \n",
"52582 2020-12-31 23:00:00 53.0 94.0 41.0 57.0 12.0 1.18 69.0 \n",
"\n",
" wind-U wind-V ... 2_SO2 2_NO2 2_O3 2_CO 1_PM2.5 1_PM10 \\\n",
"0 0.831775 -0.555113 ... NaN NaN NaN NaN NaN NaN \n",
"1 -0.695011 -0.083426 ... NaN NaN NaN NaN NaN NaN \n",
"2 -0.173311 0.469003 ... NaN NaN NaN NaN NaN NaN \n",
"3 0.000000 -0.200000 ... NaN NaN NaN NaN NaN NaN \n",
"4 1.234518 0.660276 ... NaN NaN NaN NaN NaN NaN \n",
"... ... ... ... ... ... ... ... ... ... \n",
"52578 1.067581 -0.265087 ... 25.0 34.0 43.0 0.75 31.0 59.0 \n",
"52579 0.029164 0.298579 ... 21.0 47.0 29.0 0.91 27.0 51.0 \n",
"52580 -0.079532 0.896479 ... 16.0 46.0 29.0 0.72 26.0 51.0 \n",
"52581 -1.660193 0.365729 ... 12.0 47.0 26.0 0.83 29.0 58.0 \n",
"52582 -0.106042 1.195305 ... 16.0 48.0 25.0 1.15 32.0 60.0 \n",
"\n",
" 1_SO2 1_NO2 1_O3 1_CO \n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"... ... ... ... ... \n",
"52578 21.0 47.0 29.0 0.91 \n",
"52579 16.0 46.0 29.0 0.72 \n",
"52580 12.0 47.0 26.0 0.83 \n",
"52581 16.0 48.0 25.0 1.15 \n",
"52582 23.0 49.0 20.0 0.90 \n",
"\n",
"[52583 rows x 200 columns]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview only (the result is not assigned): original columns joined with the lagged feature columns.\n",
"data.drop(columns=['features', 'feature_len']).merge(previous_out, on='date', how='left')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Combine the lagged features with the original columns, minus the listed columns.\n",
"# No `on` is given, so merge joins on the columns the two frames share (default inner join).\n",
"new_data = previous_out.merge(data.drop(columns=['features', 'feature_len', 'lookdays', 'pre_time']))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Columns to discard: every '*agricultural*' column plus NH3_power, CO_Bio and VOCs_Bio ...\n",
"agri_cols = [col for col in new_data.columns if 'agricultural' in col]\n",
"drop_cols = agri_cols + ['NH3_power', 'CO_Bio', 'VOCs_Bio']\n",
"# ... except NH3_agricultural, which is kept.\n",
"drop_cols.remove('NH3_agricultural')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"# Remove the columns selected in drop_cols.\n",
"# NOTE(review): the recorded execution counts show this cell ran (48) after the dropna (44) and\n",
"# day-drop (45) cells that follow it in the file, so a fresh Run All applies the steps in a different order.\n",
"new_data.drop(columns=drop_cols, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Discard every row that still contains a NaN in any remaining column.\n",
"new_data.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# The 'day' string column is not wanted in the exported table.\n",
"new_data.drop(columns=['day'], inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(49014, 188)"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Final size of the training table; the recorded run shows (49014, 188).\n",
"new_data.shape"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# Write the result with 'date' as the index column; 'utf-8-sig' adds a BOM to the file.\n",
"new_data.set_index('date').to_csv('new_train_data.csv', encoding='utf-8-sig')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py37",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}