27 KiB
27 KiB
In [1]:
import pandas as pd import numpy as np
In [2]:
data = pd.read_csv('./data/ori_data.csv') data.shape
Out[2]:
(52583, 54)
In [3]:
data.head()
Out[3]:
date | PM2.5 | PM10 | SO2 | NO2 | O3 | CO | Ox | wind-U | wind-V | ... | VOC_power | VOC_agricultural | PM2.5_industrial | PM2.5_transportation | PM2.5_resdient | PM2.5_power | PM2.5_agricultural | CO_Bio | VOCs_Bio | pre_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-02 01:00:00 | 136.0 | 214.0 | 317.0 | 38.0 | 8.0 | 3.71 | 46.0 | 0.831775 | -0.555113 | ... | 0.037724 | 0.0 | 0.926851 | 0.077715 | 0.827110 | 0.436028 | 0.0 | 0.081546 | 4.217706 | 2015-01-02 00:00:00 |
1 | 2015-01-02 02:00:00 | 114.0 | 176.0 | 305.0 | 38.0 | 8.0 | 3.55 | 46.0 | -0.695011 | -0.083426 | ... | 0.036215 | 0.0 | 0.926851 | 0.081248 | 0.827110 | 0.418587 | 0.0 | 0.080031 | 4.119807 | 2015-01-02 01:00:00 |
2 | 2015-01-02 03:00:00 | 97.0 | 154.0 | 306.0 | 37.0 | 7.0 | 3.51 | 44.0 | -0.173311 | 0.469003 | ... | 0.035712 | 0.0 | 0.926851 | 0.088313 | 0.827110 | 0.412773 | 0.0 | 0.077761 | 3.973464 | 2015-01-02 02:00:00 |
3 | 2015-01-02 04:00:00 | 87.0 | 141.0 | 316.0 | 38.0 | 7.0 | 3.55 | 45.0 | 0.000000 | -0.200000 | ... | 0.036718 | 0.0 | 0.926851 | 0.091256 | 0.827110 | 0.424400 | 0.0 | 0.076766 | 3.909235 | 2015-01-02 03:00:00 |
4 | 2015-01-02 05:00:00 | 85.0 | 139.0 | 292.0 | 37.0 | 7.0 | 3.62 | 44.0 | 1.234518 | 0.660276 | ... | 0.039736 | 0.0 | 0.926851 | 0.092434 | 1.746121 | 0.459282 | 0.0 | 0.077119 | 3.930702 | 2015-01-02 04:00:00 |
5 rows × 54 columns
In [4]:
data.date = pd.to_datetime(data.date)
In [5]:
import datetime as dt
In [6]:
def get_lookdays(x): start = x - dt.timedelta(hours=24) end = x - dt.timedelta(hours=1) period = pd.date_range(start, end, freq='H') return [dt.datetime.strftime(x, '%Y-%m-%d %H:%M:%S') for x in period.tolist()]
In [8]:
date_range = pd.date_range(data.date.min(), data.date.max(), freq='H')
In [9]:
out_cols = data.columns[1:7].tolist() out_cols
Out[9]:
['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'CO']
In [10]:
data['day'] = data.date.apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d')) na_counts = data.set_index('day')[out_cols].isna().groupby('day').sum()
In [11]:
drop_days = na_counts[na_counts.SO2>5].index.values
In [12]:
data = data.set_index('date').interpolate(method='linear')
In [13]:
data = data.reset_index()
In [14]:
data = data[~data.date.isin(drop_days)].copy()
In [16]:
data = data.set_index('date').reindex(date_range)
In [19]:
data.reset_index(inplace=True)
In [22]:
data.rename(columns={'index':'date'}, inplace=True)
In [25]:
data['lookdays'] = data.date.apply(get_lookdays)
In [26]:
data['features'] = data.lookdays.apply(lambda x: data[data.date.isin(x)][out_cols].values.reshape(-1,).tolist())
In [27]:
data['feature_len'] = data.features.apply(lambda x: len(x))
In [28]:
save_data = data[data.feature_len >=144].copy()
In [29]:
pre_cols = list() for i in range(24, 0, -1): for j in out_cols: pre_cols.append(f"{i}_{j}")
In [30]:
previous_out = save_data.features.apply(pd.Series, index=pre_cols) previous_out['date'] = save_data.date.values
In [31]:
data.drop(columns=['features', 'feature_len']).merge(previous_out, on='date', how='left')
Out[31]:
date | PM2.5 | PM10 | SO2 | NO2 | O3 | CO | Ox | wind-U | wind-V | ... | 2_SO2 | 2_NO2 | 2_O3 | 2_CO | 1_PM2.5 | 1_PM10 | 1_SO2 | 1_NO2 | 1_O3 | 1_CO | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-02 01:00:00 | 136.0 | 214.0 | 317.0 | 38.0 | 8.0 | 3.71 | 46.0 | 0.831775 | -0.555113 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2015-01-02 02:00:00 | 114.0 | 176.0 | 305.0 | 38.0 | 8.0 | 3.55 | 46.0 | -0.695011 | -0.083426 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2015-01-02 03:00:00 | 97.0 | 154.0 | 306.0 | 37.0 | 7.0 | 3.51 | 44.0 | -0.173311 | 0.469003 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2015-01-02 04:00:00 | 87.0 | 141.0 | 316.0 | 38.0 | 7.0 | 3.55 | 45.0 | 0.000000 | -0.200000 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2015-01-02 05:00:00 | 85.0 | 139.0 | 292.0 | 37.0 | 7.0 | 3.62 | 44.0 | 1.234518 | 0.660276 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
52578 | 2020-12-31 19:00:00 | 27.0 | 51.0 | 16.0 | 46.0 | 29.0 | 0.72 | 75.0 | 1.067581 | -0.265087 | ... | 25.0 | 34.0 | 43.0 | 0.75 | 31.0 | 59.0 | 21.0 | 47.0 | 29.0 | 0.91 |
52579 | 2020-12-31 20:00:00 | 26.0 | 51.0 | 12.0 | 47.0 | 26.0 | 0.83 | 73.0 | 0.029164 | 0.298579 | ... | 21.0 | 47.0 | 29.0 | 0.91 | 27.0 | 51.0 | 16.0 | 46.0 | 29.0 | 0.72 |
52580 | 2020-12-31 21:00:00 | 29.0 | 58.0 | 16.0 | 48.0 | 25.0 | 1.15 | 73.0 | -0.079532 | 0.896479 | ... | 16.0 | 46.0 | 29.0 | 0.72 | 26.0 | 51.0 | 12.0 | 47.0 | 26.0 | 0.83 |
52581 | 2020-12-31 22:00:00 | 32.0 | 60.0 | 23.0 | 49.0 | 20.0 | 0.90 | 69.0 | -1.660193 | 0.365729 | ... | 12.0 | 47.0 | 26.0 | 0.83 | 29.0 | 58.0 | 16.0 | 48.0 | 25.0 | 1.15 |
52582 | 2020-12-31 23:00:00 | 53.0 | 94.0 | 41.0 | 57.0 | 12.0 | 1.18 | 69.0 | -0.106042 | 1.195305 | ... | 16.0 | 48.0 | 25.0 | 1.15 | 32.0 | 60.0 | 23.0 | 49.0 | 20.0 | 0.90 |
52583 rows × 200 columns
In [34]:
new_data = previous_out.merge(data.drop(columns=['features', 'feature_len', 'lookdays', 'pre_time']))
In [42]:
drop_cols = [x for x in new_data.columns if 'agricultural' in x] + ['NH3_power'] + ['CO_Bio', 'VOCs_Bio'] drop_cols.remove('NH3_agricultural')
In [48]:
new_data.drop(columns=drop_cols, inplace=True)
In [44]:
new_data.dropna(inplace=True)
In [45]:
new_data.drop(columns=['day'], inplace=True)
In [49]:
new_data.shape
Out[49]:
(49014, 188)
In [51]:
new_data.set_index('date').to_csv('new_train_data.csv', encoding='utf-8-sig')
In [ ]: