{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(52583, 54)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the raw table (relative path); the saved output of this cell reports (52583, 54).\n", "data = pd.read_csv('./data/ori_data.csv')\n", "data.shape" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datePM2.5PM10SO2NO2O3COOxwind-Uwind-V...VOC_powerVOC_agriculturalPM2.5_industrialPM2.5_transportationPM2.5_resdientPM2.5_powerPM2.5_agriculturalCO_BioVOCs_Biopre_time
02015-01-02 01:00:00136.0214.0317.038.08.03.7146.00.831775-0.555113...0.0377240.00.9268510.0777150.8271100.4360280.00.0815464.2177062015-01-02 00:00:00
12015-01-02 02:00:00114.0176.0305.038.08.03.5546.0-0.695011-0.083426...0.0362150.00.9268510.0812480.8271100.4185870.00.0800314.1198072015-01-02 01:00:00
22015-01-02 03:00:0097.0154.0306.037.07.03.5144.0-0.1733110.469003...0.0357120.00.9268510.0883130.8271100.4127730.00.0777613.9734642015-01-02 02:00:00
32015-01-02 04:00:0087.0141.0316.038.07.03.5545.00.000000-0.200000...0.0367180.00.9268510.0912560.8271100.4244000.00.0767663.9092352015-01-02 03:00:00
42015-01-02 05:00:0085.0139.0292.037.07.03.6244.01.2345180.660276...0.0397360.00.9268510.0924341.7461210.4592820.00.0771193.9307022015-01-02 04:00:00
\n", "

5 rows × 54 columns

\n", "
" ], "text/plain": [ " date PM2.5 PM10 SO2 NO2 O3 CO Ox wind-U \\\n", "0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 0.831775 \n", "1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 -0.695011 \n", "2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 -0.173311 \n", "3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 0.000000 \n", "4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 1.234518 \n", "\n", " wind-V ... VOC_power VOC_agricultural PM2.5_industrial \\\n", "0 -0.555113 ... 0.037724 0.0 0.926851 \n", "1 -0.083426 ... 0.036215 0.0 0.926851 \n", "2 0.469003 ... 0.035712 0.0 0.926851 \n", "3 -0.200000 ... 0.036718 0.0 0.926851 \n", "4 0.660276 ... 0.039736 0.0 0.926851 \n", "\n", " PM2.5_transportation PM2.5_resdient PM2.5_power PM2.5_agricultural \\\n", "0 0.077715 0.827110 0.436028 0.0 \n", "1 0.081248 0.827110 0.418587 0.0 \n", "2 0.088313 0.827110 0.412773 0.0 \n", "3 0.091256 0.827110 0.424400 0.0 \n", "4 0.092434 1.746121 0.459282 0.0 \n", "\n", " CO_Bio VOCs_Bio pre_time \n", "0 0.081546 4.217706 2015-01-02 00:00:00 \n", "1 0.080031 4.119807 2015-01-02 01:00:00 \n", "2 0.077761 3.973464 2015-01-02 02:00:00 \n", "3 0.076766 3.909235 2015-01-02 03:00:00 \n", "4 0.077119 3.930702 2015-01-02 04:00:00 \n", "\n", "[5 rows x 54 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "data.date = pd.to_datetime(data.date)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import datetime as dt" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_lookdays(x):\n", " start = x - dt.timedelta(hours=24)\n", " end = x - dt.timedelta(hours=1)\n", " period = pd.date_range(start, end, freq='H')\n", " return [dt.datetime.strftime(x, '%Y-%m-%d %H:%M:%S') for x in period.tolist()]\n" ] }, { 
"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "date_range = pd.date_range(data.date.min(), data.date.max(), freq='H')\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'CO']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "out_cols = data.columns[1:7].tolist()\n", "out_cols" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "data['day'] = data.date.apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d'))\n", "na_counts = data.set_index('day')[out_cols].isna().groupby('day').sum()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "drop_days = na_counts[na_counts.SO2>5].index.values" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "data = data.set_index('date').interpolate(method='linear')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "data = data.reset_index()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "data = data[~data.date.isin(drop_days)].copy()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "data = data.set_index('date').reindex(date_range)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "data.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "data.rename(columns={'index':'date'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "data['lookdays'] = data.date.apply(get_lookdays)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "data['features'] = data.lookdays.apply(lambda x: 
data[data.date.isin(x)][out_cols].values.reshape(-1,).tolist())" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "data['feature_len'] = data.features.apply(lambda x: len(x))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "save_data = data[data.feature_len >=144].copy()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "pre_cols = list()\n", "for i in range(24, 0, -1):\n", " for j in out_cols:\n", " pre_cols.append(f\"{i}_{j}\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "previous_out = save_data.features.apply(pd.Series, index=pre_cols)\n", "previous_out['date'] = save_data.date.values" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datePM2.5PM10SO2NO2O3COOxwind-Uwind-V...2_SO22_NO22_O32_CO1_PM2.51_PM101_SO21_NO21_O31_CO
02015-01-02 01:00:00136.0214.0317.038.08.03.7146.00.831775-0.555113...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12015-01-02 02:00:00114.0176.0305.038.08.03.5546.0-0.695011-0.083426...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22015-01-02 03:00:0097.0154.0306.037.07.03.5144.0-0.1733110.469003...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
32015-01-02 04:00:0087.0141.0316.038.07.03.5545.00.000000-0.200000...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42015-01-02 05:00:0085.0139.0292.037.07.03.6244.01.2345180.660276...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
525782020-12-31 19:00:0027.051.016.046.029.00.7275.01.067581-0.265087...25.034.043.00.7531.059.021.047.029.00.91
525792020-12-31 20:00:0026.051.012.047.026.00.8373.00.0291640.298579...21.047.029.00.9127.051.016.046.029.00.72
525802020-12-31 21:00:0029.058.016.048.025.01.1573.0-0.0795320.896479...16.046.029.00.7226.051.012.047.026.00.83
525812020-12-31 22:00:0032.060.023.049.020.00.9069.0-1.6601930.365729...12.047.026.00.8329.058.016.048.025.01.15
525822020-12-31 23:00:0053.094.041.057.012.01.1869.0-0.1060421.195305...16.048.025.01.1532.060.023.049.020.00.90
\n", "

52583 rows × 200 columns

\n", "
" ], "text/plain": [ " date PM2.5 PM10 SO2 NO2 O3 CO Ox \\\n", "0 2015-01-02 01:00:00 136.0 214.0 317.0 38.0 8.0 3.71 46.0 \n", "1 2015-01-02 02:00:00 114.0 176.0 305.0 38.0 8.0 3.55 46.0 \n", "2 2015-01-02 03:00:00 97.0 154.0 306.0 37.0 7.0 3.51 44.0 \n", "3 2015-01-02 04:00:00 87.0 141.0 316.0 38.0 7.0 3.55 45.0 \n", "4 2015-01-02 05:00:00 85.0 139.0 292.0 37.0 7.0 3.62 44.0 \n", "... ... ... ... ... ... ... ... ... \n", "52578 2020-12-31 19:00:00 27.0 51.0 16.0 46.0 29.0 0.72 75.0 \n", "52579 2020-12-31 20:00:00 26.0 51.0 12.0 47.0 26.0 0.83 73.0 \n", "52580 2020-12-31 21:00:00 29.0 58.0 16.0 48.0 25.0 1.15 73.0 \n", "52581 2020-12-31 22:00:00 32.0 60.0 23.0 49.0 20.0 0.90 69.0 \n", "52582 2020-12-31 23:00:00 53.0 94.0 41.0 57.0 12.0 1.18 69.0 \n", "\n", " wind-U wind-V ... 2_SO2 2_NO2 2_O3 2_CO 1_PM2.5 1_PM10 \\\n", "0 0.831775 -0.555113 ... NaN NaN NaN NaN NaN NaN \n", "1 -0.695011 -0.083426 ... NaN NaN NaN NaN NaN NaN \n", "2 -0.173311 0.469003 ... NaN NaN NaN NaN NaN NaN \n", "3 0.000000 -0.200000 ... NaN NaN NaN NaN NaN NaN \n", "4 1.234518 0.660276 ... NaN NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... ... ... ... ... \n", "52578 1.067581 -0.265087 ... 25.0 34.0 43.0 0.75 31.0 59.0 \n", "52579 0.029164 0.298579 ... 21.0 47.0 29.0 0.91 27.0 51.0 \n", "52580 -0.079532 0.896479 ... 16.0 46.0 29.0 0.72 26.0 51.0 \n", "52581 -1.660193 0.365729 ... 12.0 47.0 26.0 0.83 29.0 58.0 \n", "52582 -0.106042 1.195305 ... 16.0 48.0 25.0 1.15 32.0 60.0 \n", "\n", " 1_SO2 1_NO2 1_O3 1_CO \n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "... ... ... ... ... 
\n", "52578 21.0 47.0 29.0 0.91 \n", "52579 16.0 46.0 29.0 0.72 \n", "52580 12.0 47.0 26.0 0.83 \n", "52581 16.0 48.0 25.0 1.15 \n", "52582 23.0 49.0 20.0 0.90 \n", "\n", "[52583 rows x 200 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.drop(columns=['features', 'feature_len']).merge(previous_out, on='date', how='left')" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [
"# Inner-join the lag columns with the current-hour columns; `date` is the only column the two frames share.\n",
"current_hour = data.drop(columns=['features', 'feature_len', 'lookdays', 'pre_time'])\n",
"new_data = previous_out.merge(current_hour, on='date')" ] },
{ "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [
"# Columns to discard: every '*agricultural*' column except NH3_agricultural, plus NH3_power, CO_Bio and VOCs_Bio.\n",
"agri_cols = [name for name in new_data.columns if 'agricultural' in name]\n",
"agri_cols.remove('NH3_agricultural')\n",
"drop_cols = agri_cols + ['NH3_power', 'CO_Bio', 'VOCs_Bio']" ] },
{ "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "new_data = new_data.drop(columns=drop_cols)" ] },
{ "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [
"# Remove every row that still has a missing value in any remaining column.\n",
"new_data = new_data.dropna()" ] },
{ "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "new_data = new_data.drop(columns=['day'])" ] },
{ "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(49014, 188)" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_data.shape" ] },
{ "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [
"# Write the training table with `date` as the index column.\n",
"new_data.set_index('date').to_csv('new_train_data.csv', encoding='utf-8-sig')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ],
"metadata": { "kernelspec": { "display_name": "py37", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }