22-T67/时间插值.ipynb

27 KiB
Raw Permalink Blame History

In [1]:
import pandas as pd
import numpy as np
In [2]:
data = pd.read_csv('./data/ori_data.csv')
data.shape
Out[2]:
(52583, 54)
In [3]:
data.head()
Out[3]:
date PM2.5 PM10 SO2 NO2 O3 CO Ox wind-U wind-V ... VOC_power VOC_agricultural PM2.5_industrial PM2.5_transportation PM2.5_resdient PM2.5_power PM2.5_agricultural CO_Bio VOCs_Bio pre_time
0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 0.831775 -0.555113 ... 0.037724 0.0 0.926851 0.077715 0.827110 0.436028 0.0 0.081546 4.217706 2015-01-02 00:00:00
1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 -0.695011 -0.083426 ... 0.036215 0.0 0.926851 0.081248 0.827110 0.418587 0.0 0.080031 4.119807 2015-01-02 01:00:00
2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 -0.173311 0.469003 ... 0.035712 0.0 0.926851 0.088313 0.827110 0.412773 0.0 0.077761 3.973464 2015-01-02 02:00:00
3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 0.000000 -0.200000 ... 0.036718 0.0 0.926851 0.091256 0.827110 0.424400 0.0 0.076766 3.909235 2015-01-02 03:00:00
4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 1.234518 0.660276 ... 0.039736 0.0 0.926851 0.092434 1.746121 0.459282 0.0 0.077119 3.930702 2015-01-02 04:00:00

5 rows × 54 columns

In [4]:
data.date = pd.to_datetime(data.date)
In [5]:
import datetime as dt
In [6]:
def get_lookdays(x):
    start = x - dt.timedelta(hours=24)
    end = x - dt.timedelta(hours=1)
    period = pd.date_range(start, end, freq='H')
    return [dt.datetime.strftime(x, '%Y-%m-%d %H:%M:%S') for x in period.tolist()]
In [8]:
date_range = pd.date_range(data.date.min(), data.date.max(), freq='H')
In [9]:
out_cols = data.columns[1:7].tolist()
out_cols
Out[9]:
['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'CO']
In [10]:
data['day'] = data.date.apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d'))
na_counts = data.set_index('day')[out_cols].isna().groupby('day').sum()
In [11]:
drop_days = na_counts[na_counts.SO2>5].index.values
In [12]:
data = data.set_index('date').interpolate(method='linear')
In [13]:
data = data.reset_index()
In [14]:
data = data[~data.date.isin(drop_days)].copy()
In [16]:
data = data.set_index('date').reindex(date_range)
In [19]:
data.reset_index(inplace=True)
In [22]:
data.rename(columns={'index':'date'}, inplace=True)
In [25]:
data['lookdays'] = data.date.apply(get_lookdays)
In [26]:
data['features'] = data.lookdays.apply(lambda x: data[data.date.isin(x)][out_cols].values.reshape(-1,).tolist())
In [27]:
data['feature_len'] = data.features.apply(lambda x: len(x))
In [28]:
save_data = data[data.feature_len >=144].copy()
In [29]:
pre_cols = list()
for i in range(24, 0, -1):
    for j in out_cols:
        pre_cols.append(f"{i}_{j}")
In [30]:
previous_out = save_data.features.apply(pd.Series, index=pre_cols)
previous_out['date'] = save_data.date.values
In [31]:
data.drop(columns=['features', 'feature_len']).merge(previous_out, on='date', how='left')
Out[31]:
date PM2.5 PM10 SO2 NO2 O3 CO Ox wind-U wind-V ... 2_SO2 2_NO2 2_O3 2_CO 1_PM2.5 1_PM10 1_SO2 1_NO2 1_O3 1_CO
0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 0.831775 -0.555113 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 -0.695011 -0.083426 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 -0.173311 0.469003 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 0.000000 -0.200000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 1.234518 0.660276 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52578 2020-12-31 19:00:00 27.0 51.0 16.0 46.0 29.0 0.72 75.0 1.067581 -0.265087 ... 25.0 34.0 43.0 0.75 31.0 59.0 21.0 47.0 29.0 0.91
52579 2020-12-31 20:00:00 26.0 51.0 12.0 47.0 26.0 0.83 73.0 0.029164 0.298579 ... 21.0 47.0 29.0 0.91 27.0 51.0 16.0 46.0 29.0 0.72
52580 2020-12-31 21:00:00 29.0 58.0 16.0 48.0 25.0 1.15 73.0 -0.079532 0.896479 ... 16.0 46.0 29.0 0.72 26.0 51.0 12.0 47.0 26.0 0.83
52581 2020-12-31 22:00:00 32.0 60.0 23.0 49.0 20.0 0.90 69.0 -1.660193 0.365729 ... 12.0 47.0 26.0 0.83 29.0 58.0 16.0 48.0 25.0 1.15
52582 2020-12-31 23:00:00 53.0 94.0 41.0 57.0 12.0 1.18 69.0 -0.106042 1.195305 ... 16.0 48.0 25.0 1.15 32.0 60.0 23.0 49.0 20.0 0.90

52583 rows × 200 columns

In [34]:
new_data = previous_out.merge(data.drop(columns=['features', 'feature_len', 'lookdays', 'pre_time']))
In [42]:
drop_cols = [x for x in new_data.columns if 'agricultural' in x] + ['NH3_power'] + ['CO_Bio', 'VOCs_Bio']
drop_cols.remove('NH3_agricultural')
In [48]:
new_data.drop(columns=drop_cols, inplace=True)
In [44]:
new_data.dropna(inplace=True)
In [45]:
new_data.drop(columns=['day'], inplace=True)
In [49]:
new_data.shape
Out[49]:
(49014, 188)
In [51]:
new_data.set_index('date').to_csv('new_train_data.csv', encoding='utf-8-sig')
In [ ]: