emission_detect_ai/data_analysis.ipynb

1113 lines
520 KiB
Plaintext
Raw Permalink Normal View History

2022-10-25 15:11:12 +08:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import matplotlib.pyplot as plt\n",
"#新增加的两行\n",
"from pylab import mpl\n",
"# 设置显示中文字体\n",
"mpl.rcParams[\"font.sans-serif\"] = [\"SimHei\"]\n",
"\n",
"mpl.rcParams[\"axes.unicode_minus\"] = False"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": " 日期 企业名称 地址 省份 经度 纬度 烟囱高度m \\\n0 2018-10-01 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 \n1 2018-10-02 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 \n2 2018-10-03 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 \n3 2018-10-04 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 \n4 2018-10-05 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 \n\n 脱硝工艺 脱硝剂名称 脱硝设备数量 ... 供热量(吉焦) 产渣量(吨) 机组运行时间(小时) 硫分(% 脱硫副产品产量(吨) \\\n0 SNCR SCR 氨水 3 ... 6536.83 NaN 24.0 0.51 NaN \n1 SNCR SCR 氨水 3 ... 2484.64 NaN 24.0 0.51 NaN \n2 SNCR SCR 氨水 3 ... 3020.83 NaN 24.0 0.51 NaN \n3 SNCR SCR 氨水 3 ... 5599.23 NaN 24.0 0.51 72.52 \n4 SNCR SCR 氨水 3 ... 4702.65 NaN 24.0 0.51 NaN \n\n 脱硫剂使用量(吨) 脱硫设施运行时间(小时) 脱硝还原剂消耗量(吨) 脱硝运行时间(小时) 燃料消耗量(吨) \n0 5.06 24.0 2.98 24.0 323 \n1 5.04 24.0 2.97 24.0 218 \n2 5.04 24.0 2.95 24.0 212 \n3 5.03 24.0 2.98 24.0 223 \n4 5.06 24.0 3.01 24.0 243 \n\n[5 rows x 44 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>日期</th>\n <th>企业名称</th>\n <th>地址</th>\n <th>省份</th>\n <th>经度</th>\n <th>纬度</th>\n <th>烟囱高度m</th>\n <th>脱硝工艺</th>\n <th>脱硝剂名称</th>\n <th>脱硝设备数量</th>\n <th>...</th>\n <th>供热量(吉焦)</th>\n <th>产渣量(吨)</th>\n <th>机组运行时间(小时)</th>\n <th>硫分(%</th>\n <th>脱硫副产品产量(吨)</th>\n <th>脱硫剂使用量(吨)</th>\n <th>脱硫设施运行时间(小时)</th>\n <th>脱硝还原剂消耗量(吨)</th>\n <th>脱硝运行时间(小时)</th>\n <th>燃料消耗量(吨)</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>2018-10-01</td>\n <td>浙江秀舟热电有限公司</td>\n <td>嘉兴市南湖区凤桥镇</td>\n <td>浙江省</td>\n <td>120°515.54″</td>\n <td>30°3914.76″</td>\n <td>80</td>\n <td>SNCR SCR</td>\n <td>氨水</td>\n <td>3</td>\n <td>...</td>\n <td>6536.83</td>\n <td>NaN</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>NaN</td>\n <td>5.06</td>\n <td>24.0</td>\n <td>2.98</td>\n <td>24.0</td>\n <td>323</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2018-10-02</td>\n <td>浙江秀舟热电有限公司</td>\n <td>嘉兴市南湖区凤桥镇</td>\n <td>浙江省</td>\n <td>120°515.54″</td>\n <td>30°3914.76″</td>\n <td>80</td>\n <td>SNCR SCR</td>\n <td>氨水</td>\n <td>3</td>\n <td>...</td>\n <td>2484.64</td>\n <td>NaN</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>NaN</td>\n <td>5.04</td>\n <td>24.0</td>\n <td>2.97</td>\n <td>24.0</td>\n <td>218</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2018-10-03</td>\n <td>浙江秀舟热电有限公司</td>\n <td>嘉兴市南湖区凤桥镇</td>\n <td>浙江省</td>\n <td>120°515.54″</td>\n <td>30°3914.76″</td>\n <td>80</td>\n <td>SNCR SCR</td>\n <td>氨水</td>\n <td>3</td>\n <td>...</td>\n <td>3020.83</td>\n <td>NaN</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>NaN</td>\n <td>5.04</td>\n <td>24.0</td>\n <td>2.95</td>\n <td>24.0</td>\n <td>212</td>\n </tr>\n <tr>\n <th>3</th>\n <td>2018-10-04</td>\n <td>浙江秀舟热电有限公司</td>\n <td>嘉兴市南湖区凤桥镇</td>\n <td>浙江省</td>\n 
<td>120°515.54″</td>\n <td>30°3914.76″</td>\n <td>80</td>\n <td>SNCR SCR</td>\n <td>氨水</td>\n <td>3</td>\n <td>...</td>\n <td>5599.23</td>\n <td>NaN</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>72.52</td>\n <td>5.03</td>\n <td>24.0</td>\n <td>2.98</td>\n <td>24.0</td>\n <td>223</td>\n </tr>\n <tr>\n <th>4</th>\n <td>2018-10-05</td>\n <td>浙江秀舟热电有限公司</td>\n <td>嘉兴市南湖区凤桥镇</td>\n <td>浙江省</td>\n <td>120°515.54″</td>\n <td>30°3914.76″</td>\n <td>80</td>\n <td>SNCR SCR</td>\n <td>氨水</td>\n <td>3</td>\n <td>...</td>\n <td>4702.65</td>\n <td>NaN</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>NaN</td>\n <td>5.06</td>\n <td>24.0</td>\n <td>3.01</td>\n <td>24.0</td>\n <td>243</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 44 columns</p>\n</div>"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"daily_data = pd.read_excel('./data/机器学习样表.xlsx',sheet_name=0, header=[0, 1])\n",
"old_cols = daily_data.columns\n",
"new_cols = [x[0].strip() if 'Unnamed' in x[1] else x[0]+'_'+x[1] for x in old_cols]\n",
"daily_data.columns = new_cols\n",
"daily_data.head()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"daily_data.rename(columns={\"日期\": \"days\"}, inplace=True)\n",
"daily_data.days = daily_data.days.astype(str)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": "(1178, 44)"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"daily_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": " date flow rNOx rO2 temp rSO2 rsmoke\n0 2018-03-23 00:00:00 244136.19 28.6370 7.700 51.400 0.8900 1.2000\n1 2018-03-23 01:00:00 234599.89 29.9710 7.800 51.300 0.7600 1.1700\n2 2018-03-23 02:00:00 249264.88 20.9960 7.300 54.900 2.1800 1.3600\n3 2018-03-23 03:00:00 229360.17 24.3590 7.500 52.700 1.9600 1.3500\n4 2018-03-23 04:00:00 236416.45 18.3680 7.200 55.100 1.6500 1.3500\n... ... ... ... ... ... ... ...\n33714 2022-01-26 19:00:00 255639.10 2.1000 15.719 36.720 1.7939 1.0533\n33715 2022-01-26 20:00:00 253412.80 1.6378 15.580 36.812 1.7928 1.0543\n33716 2022-01-26 21:00:00 261648.90 2.0940 15.595 36.948 1.8048 1.0547\n33717 2022-01-26 22:00:00 271429.70 1.9489 15.532 37.160 1.7887 1.0566\n33718 2022-01-26 23:00:00 272750.00 1.5552 15.435 37.279 1.7655 1.0570\n\n[33715 rows x 7 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>date</th>\n <th>flow</th>\n <th>rNOx</th>\n <th>rO2</th>\n <th>temp</th>\n <th>rSO2</th>\n <th>rsmoke</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>2018-03-23 00:00:00</td>\n <td>244136.19</td>\n <td>28.6370</td>\n <td>7.700</td>\n <td>51.400</td>\n <td>0.8900</td>\n <td>1.2000</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2018-03-23 01:00:00</td>\n <td>234599.89</td>\n <td>29.9710</td>\n <td>7.800</td>\n <td>51.300</td>\n <td>0.7600</td>\n <td>1.1700</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2018-03-23 02:00:00</td>\n <td>249264.88</td>\n <td>20.9960</td>\n <td>7.300</td>\n <td>54.900</td>\n <td>2.1800</td>\n <td>1.3600</td>\n </tr>\n <tr>\n <th>3</th>\n <td>2018-03-23 03:00:00</td>\n <td>229360.17</td>\n <td>24.3590</td>\n <td>7.500</td>\n <td>52.700</td>\n <td>1.9600</td>\n <td>1.3500</td>\n </tr>\n <tr>\n <th>4</th>\n <td>2018-03-23 04:00:00</td>\n <td>236416.45</td>\n <td>18.3680</td>\n <td>7.200</td>\n <td>55.100</td>\n <td>1.6500</td>\n <td>1.3500</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>33714</th>\n <td>2022-01-26 19:00:00</td>\n <td>255639.10</td>\n <td>2.1000</td>\n <td>15.719</td>\n <td>36.720</td>\n <td>1.7939</td>\n <td>1.0533</td>\n </tr>\n <tr>\n <th>33715</th>\n <td>2022-01-26 20:00:00</td>\n <td>253412.80</td>\n <td>1.6378</td>\n <td>15.580</td>\n <td>36.812</td>\n <td>1.7928</td>\n <td>1.0543</td>\n </tr>\n <tr>\n <th>33716</th>\n <td>2022-01-26 21:00:00</td>\n <td>261648.90</td>\n <td>2.0940</td>\n <td>15.595</td>\n <td>36.948</td>\n <td>1.8048</td>\n <td>1.0547</td>\n </tr>\n <tr>\n <th>33717</th>\n 
<td>2022-01-26 22:00:00</td>\n <td>271429.70</td>\n <td>1.9489</td>\n <td>15.532</td>\n <td>37.160</td>\n <td>1.7887</td>\n <td>1.0566</td>\n </tr>\n <tr>\n <th>33718</th>\n <td>2022-01-26 23:00:00</td>\n <td>272750.00</td>\n <td>1.5552</td>\n <td>15.435</td>\n <td>37.279</td>\n <td>1.7655</td>\n <td>1.0570</td>\n </tr>\n </tbody>\n</table>\n<p>33715 rows × 7 columns</p>\n</div>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data = pd.read_excel('./data/机器学习样表.xlsx',sheet_name=1).drop_duplicates()\n",
"hourly_data.columns = ['date', 'flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke']\n",
"hourly_data.date = hourly_data.date.astype(\"datetime64\")\n",
"ori_hourly_data = hourly_data.copy()\n",
"hourly_data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"hourly_data['rNOx'] = hourly_data.apply((lambda x: x['rNOx'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1)\n",
"hourly_data['rSO2'] = hourly_data.apply((lambda x: x['rSO2'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1)\n",
"hourly_data['rsmoke'] = hourly_data.apply((lambda x: x['rsmoke'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"我们将每天 24 个小时的数据作为特征,因此对于数据不足 24 小时的日期,要先用 NaN 填充缺失的时刻,以备后续处理。"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": "(33744, 33715)"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hour_range = pd.date_range(hourly_data.date.min(), hourly_data.date.max(), freq='H')\n",
"hour_range.shape[0], hourly_data.shape[0]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"可见少了 29 条数据(33744 − 33715),因此按完整的小时序列对 index 进行对齐,缺失的时刻会成为 NaN 行。"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"hourly_data = hourly_data.set_index(\"date\").reindex(hour_range)\n",
"hourly_data['days'] = hourly_data.index.astype(str).to_series().apply(lambda x:x.split(' ')[0]).values"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 异常值处理\n",
"对于出现的负值,先将其置为 NaN;若某一天之中出现得较多,则这一天的数据视为脏数据,在后面的缺失值分析中统一处理。"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "flow 49432.59\nrNOx -0.338727\nrO2 19.1\ntemp 34.7\nrSO2 0.124794\nrsmoke 0.490263\ndays 2020-12-01\nName: 2020-12-01 12:00:00, dtype: object"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data[hourly_data.rNOx < 0].iloc[0]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": " flow rNOx rO2 temp rSO2 rsmoke \\\n2020-12-01 12:00:00 49432.59 NaN 19.1 34.7 0.124794 0.490263 \n\n days \n2020-12-01 12:00:00 2020-12-01 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>flow</th>\n <th>rNOx</th>\n <th>rO2</th>\n <th>temp</th>\n <th>rSO2</th>\n <th>rsmoke</th>\n <th>days</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2020-12-01 12:00:00</th>\n <td>49432.59</td>\n <td>NaN</td>\n <td>19.1</td>\n <td>34.7</td>\n <td>0.124794</td>\n <td>0.490263</td>\n <td>2020-12-01</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data[hourly_data._get_numeric_data() < 0] = np.nan\n",
"hourly_data[hourly_data.index=='2020-12-01 12:00:00']"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 缺失值分析"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "['flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke']"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 特征列\n",
"num_cols = [x for x in hourly_data.columns if not x.startswith('da')]\n",
"num_cols"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# 写一个逻辑首先统计出每天有缺失数据的比例对任一数据缺失记录高于4条的判断该天在daily_data是否有生产记录若无则为脏数据需要删去\n",
"del_date = list()\n",
"for col in num_cols:\n",
" na_counts = hourly_data[hourly_data[col].isna()].days.value_counts().to_dict()\n",
" for date in na_counts:\n",
" if na_counts.get(date) < 4:\n",
" continue\n",
" try:\n",
" if date in del_date:\n",
" continue\n",
" if daily_data[daily_data.days==date].shape[0] == 0:\n",
" del_date.append(date)\n",
" continue\n",
" if daily_data[daily_data.days==date]['发电量(千瓦时)'].values[0] > 100000 or daily_data[daily_data.days==date]['供热量(吉焦)'].values[0] > 2500:\n",
" # 取缺失率高且有较大发电量的天作为删除值\n",
" del_date.append(date)\n",
" except:\n",
" print(date)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "101"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(del_date)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": "((31320, 7), (1108, 44))"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 删掉不要的脏数据\n",
"hourly_data = hourly_data[~hourly_data.days.isin(del_date)].copy()\n",
"daily_data = daily_data[~daily_data.days.isin(del_date)].copy()\n",
"hourly_data.shape, daily_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"最后看一下有无哪一天仍有很多缺失数据"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": "2019-04-08 22\n2019-07-17 3\n2018-12-19 3\n2021-02-18 3\n2019-01-16 3\n ..\n2019-03-12 1\n2019-03-17 1\n2019-03-27 1\n2019-04-04 1\n2019-05-24 1\nName: days, Length: 220, dtype: int64"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data[hourly_data[num_cols].isnull().T.any()].days.value_counts()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"于是再去掉2019-04-08的数据"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"data": {
"text/plain": "(31296, 7)"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data = hourly_data[hourly_data.days!='2019-04-08'].copy()\n",
"hourly_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 缺失值补充\n",
"1. 对 NaN 值,取其前后两个时刻的均值作为填充值。\n",
"2. 对于多个连续缺失的值,实际上应该用窗口法填充,但是难度太大,因此仍用均值填充。\n",
"3. 实现方式:分别使用 ffill 和 bfill 填充,然后将两者合并取均值。"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [
{
"data": {
"text/plain": " flow rNOx rO2 temp rSO2 rsmoke \\\n2018-03-23 00:00:00 244136.19 24.212548 7.700 51.400 0.752494 1.014598 \n2018-03-23 01:00:00 234599.89 25.348257 7.800 51.300 0.642777 0.989539 \n2018-03-23 02:00:00 249264.88 17.562606 7.300 54.900 1.823513 1.137605 \n2018-03-23 03:00:00 229360.17 20.513299 7.500 52.700 1.650563 1.136867 \n2018-03-23 04:00:00 236416.45 15.354987 7.200 55.100 1.379341 1.128551 \n... ... ... ... ... ... ... \n2022-01-26 19:00:00 255639.10 1.859704 15.719 36.720 1.588630 0.932774 \n2022-01-26 20:00:00 253412.80 1.449961 15.580 36.812 1.587185 0.933383 \n2022-01-26 21:00:00 261648.90 1.853027 15.595 36.948 1.597107 0.933327 \n2022-01-26 22:00:00 271429.70 1.723446 15.532 37.160 1.581778 0.934369 \n2022-01-26 23:00:00 272750.00 1.374762 15.435 37.279 1.560663 0.934365 \n\n days \n2018-03-23 00:00:00 2018-03-23 \n2018-03-23 01:00:00 2018-03-23 \n2018-03-23 02:00:00 2018-03-23 \n2018-03-23 03:00:00 2018-03-23 \n2018-03-23 04:00:00 2018-03-23 \n... ... \n2022-01-26 19:00:00 2022-01-26 \n2022-01-26 20:00:00 2022-01-26 \n2022-01-26 21:00:00 2022-01-26 \n2022-01-26 22:00:00 2022-01-26 \n2022-01-26 23:00:00 2022-01-26 \n\n[31296 rows x 7 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>flow</th>\n <th>rNOx</th>\n <th>rO2</th>\n <th>temp</th>\n <th>rSO2</th>\n <th>rsmoke</th>\n <th>days</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2018-03-23 00:00:00</th>\n <td>244136.19</td>\n <td>24.212548</td>\n <td>7.700</td>\n <td>51.400</td>\n <td>0.752494</td>\n <td>1.014598</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 01:00:00</th>\n <td>234599.89</td>\n <td>25.348257</td>\n <td>7.800</td>\n <td>51.300</td>\n <td>0.642777</td>\n <td>0.989539</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 02:00:00</th>\n <td>249264.88</td>\n <td>17.562606</td>\n <td>7.300</td>\n <td>54.900</td>\n <td>1.823513</td>\n <td>1.137605</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 03:00:00</th>\n <td>229360.17</td>\n <td>20.513299</td>\n <td>7.500</td>\n <td>52.700</td>\n <td>1.650563</td>\n <td>1.136867</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 04:00:00</th>\n <td>236416.45</td>\n <td>15.354987</td>\n <td>7.200</td>\n <td>55.100</td>\n <td>1.379341</td>\n <td>1.128551</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>2022-01-26 19:00:00</th>\n <td>255639.10</td>\n <td>1.859704</td>\n <td>15.719</td>\n <td>36.720</td>\n <td>1.588630</td>\n <td>0.932774</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 20:00:00</th>\n <td>253412.80</td>\n <td>1.449961</td>\n <td>15.580</td>\n <td>36.812</td>\n <td>1.587185</td>\n <td>0.933383</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 21:00:00</th>\n <td>261648.90</td>\n <td>1.853027</td>\n <td>15.595</td>\n 
<td>36.948</td>\n <td>1.597107</td>\n <td>0.933327</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 22:00:00</th>\n <td>271429.70</td>\n <td>1.723446</td>\n <td>15.532</td>\n <td>37.160</td>\n <td>1.581778</td>\n <td>0.934369</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 23:00:00</th>\n <td>272750.00</td>\n <td>1.374762</td>\n <td>15.435</td>\n <td>37.279</td>\n <td>1.560663</td>\n <td>0.934365</td>\n <td>2022-01-26</td>\n </tr>\n </tbody>\n</table>\n<p>31296 rows × 7 columns</p>\n</div>"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data_ffill = hourly_data.ffill()\n",
"hourly_data_ffill"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": " flow rNOx rO2 temp rSO2 rsmoke \\\n2018-03-23 00:00:00 244136.19 24.212548 7.700 51.400 0.752494 1.014598 \n2018-03-23 01:00:00 234599.89 25.348257 7.800 51.300 0.642777 0.989539 \n2018-03-23 02:00:00 249264.88 17.562606 7.300 54.900 1.823513 1.137605 \n2018-03-23 03:00:00 229360.17 20.513299 7.500 52.700 1.650563 1.136867 \n2018-03-23 04:00:00 236416.45 15.354987 7.200 55.100 1.379341 1.128551 \n... ... ... ... ... ... ... \n2022-01-26 19:00:00 255639.10 1.859704 15.719 36.720 1.588630 0.932774 \n2022-01-26 20:00:00 253412.80 1.449961 15.580 36.812 1.587185 0.933383 \n2022-01-26 21:00:00 261648.90 1.853027 15.595 36.948 1.597107 0.933327 \n2022-01-26 22:00:00 271429.70 1.723446 15.532 37.160 1.581778 0.934369 \n2022-01-26 23:00:00 272750.00 1.374762 15.435 37.279 1.560663 0.934365 \n\n days \n2018-03-23 00:00:00 2018-03-23 \n2018-03-23 01:00:00 2018-03-23 \n2018-03-23 02:00:00 2018-03-23 \n2018-03-23 03:00:00 2018-03-23 \n2018-03-23 04:00:00 2018-03-23 \n... ... \n2022-01-26 19:00:00 2022-01-26 \n2022-01-26 20:00:00 2022-01-26 \n2022-01-26 21:00:00 2022-01-26 \n2022-01-26 22:00:00 2022-01-26 \n2022-01-26 23:00:00 2022-01-26 \n\n[31296 rows x 7 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>flow</th>\n <th>rNOx</th>\n <th>rO2</th>\n <th>temp</th>\n <th>rSO2</th>\n <th>rsmoke</th>\n <th>days</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2018-03-23 00:00:00</th>\n <td>244136.19</td>\n <td>24.212548</td>\n <td>7.700</td>\n <td>51.400</td>\n <td>0.752494</td>\n <td>1.014598</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 01:00:00</th>\n <td>234599.89</td>\n <td>25.348257</td>\n <td>7.800</td>\n <td>51.300</td>\n <td>0.642777</td>\n <td>0.989539</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 02:00:00</th>\n <td>249264.88</td>\n <td>17.562606</td>\n <td>7.300</td>\n <td>54.900</td>\n <td>1.823513</td>\n <td>1.137605</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 03:00:00</th>\n <td>229360.17</td>\n <td>20.513299</td>\n <td>7.500</td>\n <td>52.700</td>\n <td>1.650563</td>\n <td>1.136867</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>2018-03-23 04:00:00</th>\n <td>236416.45</td>\n <td>15.354987</td>\n <td>7.200</td>\n <td>55.100</td>\n <td>1.379341</td>\n <td>1.128551</td>\n <td>2018-03-23</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>2022-01-26 19:00:00</th>\n <td>255639.10</td>\n <td>1.859704</td>\n <td>15.719</td>\n <td>36.720</td>\n <td>1.588630</td>\n <td>0.932774</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 20:00:00</th>\n <td>253412.80</td>\n <td>1.449961</td>\n <td>15.580</td>\n <td>36.812</td>\n <td>1.587185</td>\n <td>0.933383</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 21:00:00</th>\n <td>261648.90</td>\n <td>1.853027</td>\n <td>15.595</td>\n 
<td>36.948</td>\n <td>1.597107</td>\n <td>0.933327</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 22:00:00</th>\n <td>271429.70</td>\n <td>1.723446</td>\n <td>15.532</td>\n <td>37.160</td>\n <td>1.581778</td>\n <td>0.934369</td>\n <td>2022-01-26</td>\n </tr>\n <tr>\n <th>2022-01-26 23:00:00</th>\n <td>272750.00</td>\n <td>1.374762</td>\n <td>15.435</td>\n <td>37.279</td>\n <td>1.560663</td>\n <td>0.934365</td>\n <td>2022-01-26</td>\n </tr>\n </tbody>\n</table>\n<p>31296 rows × 7 columns</p>\n</div>"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data_bfill = hourly_data.bfill()\n",
"hourly_data_bfill"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"hourly_data_fixed = (hourly_data_ffill[num_cols] + hourly_data_ffill[num_cols]) / 2"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"data": {
"text/plain": " flow rNOx rO2 temp rSO2 rsmoke \\\n2020-12-01 12:00:00 49432.59 1.35584 19.1 34.7 0.124794 0.490263 \n\n days \n2020-12-01 12:00:00 2020-12-01 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>flow</th>\n <th>rNOx</th>\n <th>rO2</th>\n <th>temp</th>\n <th>rSO2</th>\n <th>rsmoke</th>\n <th>days</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2020-12-01 12:00:00</th>\n <td>49432.59</td>\n <td>1.35584</td>\n <td>19.1</td>\n <td>34.7</td>\n <td>0.124794</td>\n <td>0.490263</td>\n <td>2020-12-01</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hourly_data_fixed['days'] = hourly_data_fixed.index.astype(str).to_series().apply(lambda x:x.split(' ')[0]).values\n",
"hourly_data_fixed[hourly_data_fixed.index=='2020-12-01 12:00:00']"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 特征工程\n",
"将每天每小时的数据平铺开,作为当天的 24×6 个特征。"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": "['0_flow',\n '0_rNOx',\n '0_rO2',\n '0_temp',\n '0_rSO2',\n '0_rsmoke',\n '1_flow',\n '1_rNOx',\n '1_rO2',\n '1_temp',\n '1_rSO2',\n '1_rsmoke',\n '2_flow',\n '2_rNOx',\n '2_rO2',\n '2_temp',\n '2_rSO2',\n '2_rsmoke',\n '3_flow',\n '3_rNOx',\n '3_rO2',\n '3_temp',\n '3_rSO2',\n '3_rsmoke',\n '4_flow',\n '4_rNOx',\n '4_rO2',\n '4_temp',\n '4_rSO2',\n '4_rsmoke',\n '5_flow',\n '5_rNOx',\n '5_rO2',\n '5_temp',\n '5_rSO2',\n '5_rsmoke',\n '6_flow',\n '6_rNOx',\n '6_rO2',\n '6_temp',\n '6_rSO2',\n '6_rsmoke',\n '7_flow',\n '7_rNOx',\n '7_rO2',\n '7_temp',\n '7_rSO2',\n '7_rsmoke',\n '8_flow',\n '8_rNOx',\n '8_rO2',\n '8_temp',\n '8_rSO2',\n '8_rsmoke',\n '9_flow',\n '9_rNOx',\n '9_rO2',\n '9_temp',\n '9_rSO2',\n '9_rsmoke',\n '10_flow',\n '10_rNOx',\n '10_rO2',\n '10_temp',\n '10_rSO2',\n '10_rsmoke',\n '11_flow',\n '11_rNOx',\n '11_rO2',\n '11_temp',\n '11_rSO2',\n '11_rsmoke',\n '12_flow',\n '12_rNOx',\n '12_rO2',\n '12_temp',\n '12_rSO2',\n '12_rsmoke',\n '13_flow',\n '13_rNOx',\n '13_rO2',\n '13_temp',\n '13_rSO2',\n '13_rsmoke',\n '14_flow',\n '14_rNOx',\n '14_rO2',\n '14_temp',\n '14_rSO2',\n '14_rsmoke',\n '15_flow',\n '15_rNOx',\n '15_rO2',\n '15_temp',\n '15_rSO2',\n '15_rsmoke',\n '16_flow',\n '16_rNOx',\n '16_rO2',\n '16_temp',\n '16_rSO2',\n '16_rsmoke',\n '17_flow',\n '17_rNOx',\n '17_rO2',\n '17_temp',\n '17_rSO2',\n '17_rsmoke',\n '18_flow',\n '18_rNOx',\n '18_rO2',\n '18_temp',\n '18_rSO2',\n '18_rsmoke',\n '19_flow',\n '19_rNOx',\n '19_rO2',\n '19_temp',\n '19_rSO2',\n '19_rsmoke',\n '20_flow',\n '20_rNOx',\n '20_rO2',\n '20_temp',\n '20_rSO2',\n '20_rsmoke',\n '21_flow',\n '21_rNOx',\n '21_rO2',\n '21_temp',\n '21_rSO2',\n '21_rsmoke',\n '22_flow',\n '22_rNOx',\n '22_rO2',\n '22_temp',\n '22_rSO2',\n '22_rsmoke',\n '23_flow',\n '23_rNOx',\n '23_rO2',\n '23_temp',\n '23_rSO2',\n '23_rsmoke']"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_cols = [f\"{x}_{y}\" for x in range(24) for y in num_cols]\n",
"feature_cols"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": " flow \\\ndays \n2018-03-23 [244136.19, 234599.89, 249264.88, 229360.17, 2... \n2018-03-24 [234070.07, 235778.62, 231371.79, 234002.61, 2... \n2018-03-25 [228939.37, 232613.69, 229586.17, 235035.84, 2... \n2018-03-26 [231112.06, 225984.7, 224547.59, 221822.01, 21... \n2018-03-27 [226140.95, 219510.17, 215491.43, 205450.35, 2... \n... ... \n2022-01-22 [217544.3, 223416.0, 221987.6, 216571.4, 21647... \n2022-01-23 [204086.4, 213480.5, 207928.1, 210432.4, 20739... \n2022-01-24 [196331.5, 204396.6, 209247.7, 208345.4, 20784... \n2022-01-25 [241509.4, 251172.7, 222005.2, 216005.8, 21869... \n2022-01-26 [263819.8, 263171.3, 260461.9, 257509.9, 25479... \n\n rNOx \\\ndays \n2018-03-23 [24.212547548922526, 25.348256763471422, 17.56... \n2018-03-24 [21.236174270928696, 21.840986084108295, 21.63... \n2018-03-25 [17.854300390828886, 18.93070026209273, 18.963... \n2018-03-26 [25.934359751728444, 21.81253685318773, 21.535... \n2018-03-27 [11.968345200357508, 21.178438475476018, 26.25... \n... ... \n2022-01-22 [3.045354822347651, 2.8669384829826425, 3.2684... \n2022-01-23 [0.5965432853216215, 0.7573690670940121, 0.493... \n2022-01-24 [11.031038246691466, 11.518845457518038, 13.18... \n2022-01-25 [9.760638960096678, 9.68924708625155, 13.61718... \n2022-01-26 [3.085553795959118, 2.6547312605083198, 2.1764... \n\n rO2 \\\ndays \n2018-03-23 [7.7, 7.8, 7.3, 7.5, 7.2, 7.7, 7.3, 7.0, 6.2, ... \n2018-03-24 [7.4, 7.5, 7.4, 7.3, 7.5, 7.6, 7.5, 6.8, 5.7, ... \n2018-03-25 [7.4, 7.6, 7.8, 8.1, 8.0, 7.6, 7.4, 6.9, 5.9, ... \n2018-03-26 [7.5, 7.4, 7.5, 7.5, 7.5, 7.1, 7.0, 7.0, 5.7, ... \n2018-03-27 [6.9, 7.4, 7.5, 7.6, 7.6, 7.2, 7.3, 6.6, 5.7, ... \n... ... \n2022-01-22 [7.756, 7.856, 7.885, 8.05, 8.674, 8.493, 8.32... \n2022-01-23 [7.43, 8.015, 7.812, 7.805, 7.938, 8.12, 8.161... \n2022-01-24 [9.172, 9.131, 8.842, 8.987, 9.201, 9.489, 9.1... \n2022-01-25 [12.177, 11.365, 10.113, 10.497, 10.501, 10.84... \n2022-01-26 [15.38, 15.623, 15.897, 15.944, 15.919, 15.777... 
\n\n temp \\\ndays \n2018-03-23 [51.4, 51.3, 54.9, 52.7, 55.1, 51.9, 51.7, 52.... \n2018-03-24 [51.3, 51.6, 51.7, 52.0, 52.0, 51.9, 51.7, 52.... \n2018-03-25 [52.6, 52.4, 52.3, 52.2, 52.0, 53.3, 55.5, 53.... \n2018-03-26 [52.4, 52.3, 52.2, 52.6, 52.4, 54.1, 55.2, 53.... \n2018-03-27 [55.7, 53.0, 52.0, 52.0, 51.8, 52.2, 52.3, 53.... \n... ... \n2022-01-22 [55.539, 55.741, 55.812, 55.564, 55.491, 54.76... \n2022-01-23 [55.836, 55.577, 55.237, 55.573, 55.512, 54.88... \n2022-01-24 [52.855, 53.409, 53.889, 54.006, 53.944, 53.07... \n2022-01-25 [49.297, 48.991, 50.58, 50.893, 50.853, 50.416... \n2022-01-26 [37.445, 37.231, 36.495, 35.936, 35.819, 35.78... \n\n rSO2 \\\ndays \n2018-03-23 [0.7524938826881673, 0.642777189290924, 1.8235... \n2018-03-24 [0.2791006216657959, 0.287292450812319, 0.2956... \n2018-03-25 [5.332282211556822, 3.009154456372548, 4.68796... \n2018-03-26 [2.0988220157892563, 1.7874981882009273, 1.796... \n2018-03-27 [0.21695389751746164, 0.3281261324263327, 0.39... \n... ... \n2022-01-22 [0.6289737165296122, 0.6122342875306045, 0.625... \n2022-01-23 [0.5333190389186867, 0.
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>flow</th>\n <th>rNOx</th>\n <th>rO2</th>\n <th>temp</th>\n <th>rSO2</th>\n <th>rsmoke</th>\n </tr>\n <tr>\n <th>days</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2018-03-23</th>\n <td>[244136.19, 234599.89, 249264.88, 229360.17, 2...</td>\n <td>[24.212547548922526, 25.348256763471422, 17.56...</td>\n <td>[7.7, 7.8, 7.3, 7.5, 7.2, 7.7, 7.3, 7.0, 6.2, ...</td>\n <td>[51.4, 51.3, 54.9, 52.7, 55.1, 51.9, 51.7, 52....</td>\n <td>[0.7524938826881673, 0.642777189290924, 1.8235...</td>\n <td>[1.0145984935121357, 0.9895385677241856, 1.137...</td>\n </tr>\n <tr>\n <th>2018-03-24</th>\n <td>[234070.07, 235778.62, 231371.79, 234002.61, 2...</td>\n <td>[21.236174270928696, 21.840986084108295, 21.63...</td>\n <td>[7.4, 7.5, 7.4, 7.3, 7.5, 7.6, 7.5, 6.8, 5.7, ...</td>\n <td>[51.3, 51.6, 51.7, 52.0, 52.0, 51.9, 51.7, 52....</td>\n <td>[0.2791006216657959, 0.287292450812319, 0.2956...</td>\n <td>[0.92187781095672, 0.8956764642972298, 0.92074...</td>\n </tr>\n <tr>\n <th>2018-03-25</th>\n <td>[228939.37, 232613.69, 229586.17, 235035.84, 2...</td>\n <td>[17.854300390828886, 18.93070026209273, 18.963...</td>\n <td>[7.4, 7.6, 7.8, 8.1, 8.0, 7.6, 7.4, 6.9, 5.9, ...</td>\n <td>[52.6, 52.4, 52.3, 52.2, 52.0, 53.3, 55.5, 53....</td>\n <td>[5.332282211556822, 3.009154456372548, 4.68796...</td>\n <td>[0.8255350027370751, 0.8260423997885425, 0.809...</td>\n </tr>\n <tr>\n <th>2018-03-26</th>\n <td>[231112.06, 225984.7, 224547.59, 221822.01, 21...</td>\n <td>[25.934359751728444, 21.81253685318773, 21.535...</td>\n <td>[7.5, 7.4, 7.5, 7.5, 7.5, 7.1, 7.0, 7.0, 5.7, ...</td>\n <td>[52.4, 52.3, 52.2, 
52.6, 52.4, 54.1, 55.2, 53....</td>\n <td>[2.0988220157892563, 1.7874981882009273, 1.796...</td>\n <td>[0.9271904487422418, 0.944338665464641, 0.9108...</td>\n </tr>\n <tr>\n <th>2018-03-27</th>\n <td>[226140.95, 219510.17, 215491.43, 205450.35, 2...</td>\n <td>[11.968345200357508, 21.178438475476018, 26.25...</td>\n <td>[6.9, 7.4, 7.5, 7.6, 7.6, 7.2, 7.3, 6.6, 5.7, ...</td>\n <td>[55.7, 53.0, 52.0, 52.0, 51.8, 52.2, 52.3, 53....</td>\n <td>[0.21695389751746164, 0.3281261324263327, 0.39...</td>\n <td>[1.1014582489348053, 1.0853402841794082, 0.978...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>2022-01-22</th>\n <td>[217544.3, 223416.0, 221987.6, 216571.4, 21647...</td>\n <td>[3.045354822347651, 2.8669384829826425, 3.2684...</td>\n <td>[7.756, 7.856, 7.885, 8.05, 8.674, 8.493, 8.32...</td>\n <td>[55.539, 55.741, 55.812, 55.564, 55.491, 54.76...</td>\n <td>[0.6289737165296122, 0.6122342875306045, 0.625...</td>\n <td>[1.0424734269053981, 1.0191389782210867, 1.039...</td>\n </tr>\n <tr>\n <th>2022-01-23</th>\n <td>[204086.4, 213480.5, 207928.1, 210432.4, 20739...</td>\n <td>[0.5965432853216215, 0.7573690670940121, 0.493...</td>\n <td>[7.43, 8.015, 7.812, 7.805, 7.938, 8.12, 8.161...</td>\n <td>[55.836, 55.577, 55.237, 55.573, 55.512, 54.88...</td>\n <td>[0.5333190389186867, 0.5314856001529347, 0.561...</td>\n <td>[1.0451184794574826, 1.022569279389579, 1.0344...</td>\n </tr>\n <tr>\n <th>2022-01-24</th>\n <td>[196331.5, 204396.6, 209247.7, 208345.4, 20784...</td>\n <td>[11.031038246691466, 11.518845
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"daily_emiss_data = hourly_data_fixed.groupby('days').agg(list)\n",
"daily_emiss_data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [],
"source": [
"def merge(x1, x2, x3, x4, x5, x6):\n",
" return sum([x1, x2, x3, x4, x5, x6], [])"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": "Index(['flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke'], dtype='object')"
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"daily_emiss_data.columns"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [
{
"data": {
"text/plain": " 0_flow 0_rNOx 0_rO2 0_temp 0_rSO2 0_rsmoke \\\ndays \n2018-03-23 244136.19 234599.89 249264.88 229360.17 236416.45 236113.88 \n2018-03-24 234070.07 235778.62 231371.79 234002.61 224972.48 212372.97 \n2018-03-25 228939.37 232613.69 229586.17 235035.84 227862.00 233114.53 \n2018-03-26 231112.06 225984.70 224547.59 221822.01 219699.59 216640.45 \n2018-03-27 226140.95 219510.17 215491.43 205450.35 209391.39 202650.57 \n... ... ... ... ... ... ... \n2022-01-22 217544.30 223416.00 221987.60 216571.40 216474.30 217356.80 \n2022-01-23 204086.40 213480.50 207928.10 210432.40 207398.30 205233.10 \n2022-01-24 196331.50 204396.60 209247.70 208345.40 207840.00 203811.20 \n2022-01-25 241509.40 251172.70 222005.20 216005.80 218697.30 217854.50 \n2022-01-26 263819.80 263171.30 260461.90 257509.90 254797.00 255295.30 \n\n 1_flow 1_rNOx 1_rO2 1_temp ... 22_rO2 \\\ndays ... \n2018-03-23 243835.88 254941.06 263172.44 265048.62 ... 1.108454 \n2018-03-24 227885.84 252032.30 257109.81 252191.47 ... 1.139022 \n2018-03-25 224467.42 252500.03 240797.89 247235.60 ... 0.924819 \n2018-03-26 228010.51 255410.68 260558.74 253688.93 ... 0.849763 \n2018-03-27 207802.10 247740.44 254831.56 251766.47 ... 1.292986 \n... ... ... ... ... ... ... \n2022-01-22 215403.80 216200.90 224753.10 226410.50 ... 1.036403 \n2022-01-23 202121.80 203224.70 218894.90 238739.10 ... 1.024024 \n2022-01-24 204427.60 208614.90 204340.80 206945.40 ... 1.007186 \n2022-01-25 198460.70 199237.90 203870.10 224508.80 ... 0.936690 \n2022-01-26 255801.90 261359.50 267883.50 271961.30 ... 0.929660 \n\n 22_temp 22_rSO2 22_rsmoke 23_flow 23_rNOx 23_rO2 \\\ndays \n2018-03-23 1.149077 1.116597 1.070484 0.965953 1.005916 1.013989 \n2018-03-24 1.155038 1.161774 1.112287 0.960906 0.885317 0.892926 \n2018-03-25 0.975402 1.093083 0.968145 0.934758 0.900796 0.934184 \n2018-03-26 0.929620 0.984937 0.971792 0.953290 0.882604 0.943758 \n2018-03-27 1.344673 1.348917 1.185938 1.146697 1.155129 1.088680 \n... ... ... ... ... ... 
... \n2022-01-22 1.037442 1.039126 1.045166 1.045001 1.037889 1.040583 \n2022-01-23 1.032411 1.035073 1.033276 1.035860 1.033211 1.031058 \n2022-01-24 1.008622 1.012684 1.021570 1.022493 1.015565 1.008908 \n2022-01-25 0.932645 0.935892 0.934892 0.933680 0.932380 0.928861 \n2022-01-26 0.929660 0.929660 0.929507 0.931257 0.932774 0.933383 \n\n 23_temp 23_rSO2 23_rsmoke \ndays \n2018-03-23 0.994012 0.978667 0.945210 \n2018-03-24 0.885317 0.870327 0.800264 \n2018-03-25 0.954531 0.960316 0.926621 \n2018-03-26 0.960906 0.995540 1.021165 \n2018-03-27 1.045839 0.937636 0.894574 \n... ... ... ... \n2022-01-22 1.039620 1.038964 1.041814 \n2022-01-23 1.025498 1.023998 1.022248 \n2022-01-24 0.994918 0.986704 0.987378 \n2022-01-25 0.928384 0.929663 0.932935 \n2022-01-26 0.933327 0.934369 0.934365 \n\n[1304 rows x 144 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0_flow</th>\n <th>0_rNOx</th>\n <th>0_rO2</th>\n <th>0_temp</th>\n <th>0_rSO2</th>\n <th>0_rsmoke</th>\n <th>1_flow</th>\n <th>1_rNOx</th>\n <th>1_rO2</th>\n <th>1_temp</th>\n <th>...</th>\n <th>22_rO2</th>\n <th>22_temp</th>\n <th>22_rSO2</th>\n <th>22_rsmoke</th>\n <th>23_flow</th>\n <th>23_rNOx</th>\n <th>23_rO2</th>\n <th>23_temp</th>\n <th>23_rSO2</th>\n <th>23_rsmoke</th>\n </tr>\n <tr>\n <th>days</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2018-03-23</th>\n <td>244136.19</td>\n <td>234599.89</td>\n <td>249264.88</td>\n <td>229360.17</td>\n <td>236416.45</td>\n <td>236113.88</td>\n <td>243835.88</td>\n <td>254941.06</td>\n <td>263172.44</td>\n <td>265048.62</td>\n <td>...</td>\n <td>1.108454</td>\n <td>1.149077</td>\n <td>1.116597</td>\n <td>1.070484</td>\n <td>0.965953</td>\n <td>1.005916</td>\n <td>1.013989</td>\n <td>0.994012</td>\n <td>0.978667</td>\n <td>0.945210</td>\n </tr>\n <tr>\n <th>2018-03-24</th>\n <td>234070.07</td>\n <td>235778.62</td>\n <td>231371.79</td>\n <td>234002.61</td>\n <td>224972.48</td>\n <td>212372.97</td>\n <td>227885.84</td>\n <td>252032.30</td>\n <td>257109.81</td>\n <td>252191.47</td>\n <td>...</td>\n <td>1.139022</td>\n <td>1.155038</td>\n <td>1.161774</td>\n <td>1.112287</td>\n <td>0.960906</td>\n <td>0.885317</td>\n <td>0.892926</td>\n <td>0.885317</td>\n <td>0.870327</td>\n <td>0.800264</td>\n </tr>\n <tr>\n <th>2018-03-25</th>\n <td>228939.37</td>\n 
<td>232613.69</td>\n <td>229586.17</td>\n <td>235035.84</td>\n <td>227862.00</td>\n <td>233114.53</td>\n <td>224467.42</td>\n <td>252500.03</td>\n <td>240797.89</td>\n <td>247235.60</td>\n <td>...</td>\n <td>0.924819</td>\n <td>0.975402</td>\n <td>1.093083</td>\n <td>0.968145</td>\n <td>0.934758</td>\n <td>0.900796</td>\n <td>0.934184</td>\n <td>0.954531</td>\n <td>0.960316</td>\n <td>0.926621</td>\n </tr>\n <tr>\n <th>2018-03-26</th>\n <td>231112.06</td>\n <td>225984.70</td>\n <td>224547.59</td>\n <td>221822.01</td>\n <td>219699.59</td>\n <td>216640.45</td>\n <td>228010.51</td>\n <td>255410.68</td>\n <td>260558.74</td>\n <td>253688.93</td>\n <td>...</td>\n <td>0.849763</td>\n <td>0.929620</td>\n <td>0.984937</td>\n <td>0.971792</td>\n <td>0.953290</td>\n <td>0.882604</td>\n <td>0.943758</td>\n <td>0.960906</td>\n <td>0.995540</td>\n <td>1.021165</td>\n </tr>\n <tr>\n <th>2018-03-27</th>\n <td>226140.95</td>\n <td>219510.17</td>\n <td>215491.43</td>\n <td>205450.35</td>\n <td>209391.39</td>\n <td>202650.57</td>\n <td>207802.10</td>\n <td>247740.44</td>\n <td>254831.56</td>\n <td>251766.47</td>\n <td>...</td>\n <td>1.292986</td>\n <td>1.344673</td>\n <td>1.348917</td>\n <td>1.185938</td>\n <td>1.146697</td>\n <td>1.155129</td>\n <td>1.0
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"daily_emiss_data['feature_cols'] = daily_emiss_data.apply(lambda row: merge(row['flow'], row['rNOx'], row['rO2'], row['temp'], row['rSO2'], row['rsmoke']), axis=1)\n",
"train_hourly_data = pd.DataFrame.from_records(np.array(daily_emiss_data['feature_cols'].values))\n",
"train_hourly_data.set_index(daily_emiss_data.index, inplace=True)\n",
"train_hourly_data.columns = feature_cols\n",
"train_hourly_data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [],
"source": [
"hourly_data_fixed['cNOx'] = hourly_data_fixed.flow * hourly_data_fixed.rNOx\n",
"hourly_data_fixed['cO2'] = hourly_data_fixed.flow * hourly_data_fixed.rO2\n",
"hourly_data_fixed['cSO2'] = hourly_data_fixed.flow * hourly_data_fixed.rSO2\n",
"hourly_data_fixed['csmoke'] = hourly_data_fixed.flow * hourly_data_fixed.rsmoke"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [
{
"data": {
"text/plain": "(1304, 11)"
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grp = hourly_data_fixed.groupby('days')\n",
"new_data = grp.agg({'cNOx': sum, 'cSO2':sum, 'cO2':sum, 'csmoke':sum,\n",
" 'flow':np.mean, 'rNOx':np.mean, 'rO2':np.mean, 'temp':np.mean,'rSO2':np.mean, 'rsmoke':np.mean}).reset_index()\n",
"new_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"data": {
"text/plain": "(1304, 155)"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_hourly_data = train_hourly_data.reset_index().merge(new_data, how='left', on='days')\n",
"final_hourly_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 30,
"outputs": [
{
"data": {
"text/plain": "(1108, 44)"
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"daily_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 31,
"outputs": [],
"source": [
"import seaborn as sns\n",
"from scipy.stats import norm"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\zhaojh\\Miniconda3\\envs\\py38\\lib\\site-packages\\seaborn\\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
" warnings.warn(msg, FutureWarning)\n"
]
},
{
"data": {
"text/plain": "<AxesSubplot:xlabel='燃料消耗量(吨)', ylabel='Density'>"
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkIAAAGtCAYAAAD6XRvKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB6oklEQVR4nO3dd3hUZdrA4d+ZmUx6TwiQQOgtdOkgQkQRcEFBRVQUFJWy69oRXEVF12VVFte1oQIi4IcUEQREBBELvQYMBEJJg0B6z7Tz/TFkIJA6mdR57uuaC2bOOe88580kefJWRVVVFSGEEEIIJ6Sp7QCEEEIIIWqLJEJCCCGEcFqSCAkhhBDCaUkiJIQQQginJYmQEEIIIZyWJEJCCCGEcFqSCAkhhBDCaelqO4C6zGKxYDKZ0Gg0KIpS2+EIIYQQogJUVcVisaDT6dBoym7zkUSoDCaTiaioqNoOQwghhBB26NKlC3q9vsxzJBEqQ1EW2aVLF7RabS1HUzPMZjNRUVFOdc9VJXVmH6m3ypM6qzypM/vU93orir+81iCQRKhMRd1hWq22Xn4QqsIZ77mqpM7sI/VWeVJnlSd1Zp/6Xm8VGdYig6WFEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOS1fbAQghhDPIzDeSXWAs8xxvNxd83V1qKCIhBEgiJIQQNSK7wMjB8xkYzZYSj7toNfQM95NESIgaJomQEELUEKPZQqGp5ERICFE7ZIyQEEIIIZyWJEJCCCGEcFqSCAkhhBDCaUkiJIQQQginJYmQEEIIIZyWJEJCCCGEcFqSCAkhhBDCaUkiJIQQQginJYmQEEIIIZyWJEJCCCGEcFqSCAkhhBDCaUkiJIQQQginJYmQEEIIIZyWJEJCCCGEcFqSCAkhhBDCaUkiJIQQQginJYmQEEIIIZyWJEJCCCGEcFqSCAkhhBDCaUkiJIQQQginJYmQEEIIIZyWJEJCCCGEcFqSCAkhhBDCaUkiJIQQQginVSuJUExMDOPGjaN3797MmzcPVVXLvWbv3r2MGDGCvn37snjx4gofmzp1Ku3bt7c9Jk2a5MhbEUIIIUQ9VuOJkMFgYOrUqURERLBmzRpiY2NZu3ZtmdekpaUxbdo0Ro0axcqVK9mwYQO7d+8u9xjAsWPH2LBhA/v27WPfvn189NFH1Xp/QgghhKg/ajwR2rlzJzk5OcyaNYvmzZvz7LPPsnr16jKvWb9+PcHBwcyYMYMWLVowffp02zVlHbt48SIA7dq1w8fHBx8fHzw8PKr3BoUQQghRb+hq+g1PnDhBt27dcHd3B6B9+/bExsaWec3Jkyfp168fiqIA0LVrV+bPn1/usaioKMxmM4MHDyYrK4uhQ4fy2muv4evrW6mYzWZzpc6vz4ru1ZnuuaqkzuzjbPWmWlQsFgsWi6XE4xaL9Zyy6sPZ6swRpM7sU9/rrTJx13gilJOTQ1hYmO25oihoNBoyMzNLTVBycnJo3bq17bmXlxfJycnlHjt79iwRERHMnDkTjUbDrFmzmD9/Pq+//nqlYo6KiqrU+Q2BM95zVUmd2ccZ6k2r1WJy9SExKZl8g6nEc9z1OpK9TKTGZ5X7Q9wZ6szRpM7s4wz1VuOJkFarRa/XF3vN1dWVgoKCUhOh668pOr+8Y0888QRPPPGE7djzzz/PU089VelEqEuXLmi12kpdU1+ZzWaioqKc6p6rSurMPs5Wb0kZBYTm6Cg0ldwi5KrTENIokKZ+4aWW4Wx15ghSZ/ap7/VWFH9F1Hgi5Ovry6lTp4q9lpubi4uLS5nXpKWllXh+Wceu5+PjQ3p6OgaD4YZkrCxarbZefhCqwhnvuaqkzuzjLPWmaKyt35
pSRmZqNBoUjVKhunCWOnMkqTP7OEO91fhg6S5dunDkyBHb84SEBAwGQ5njdq6/Jjo6mpCQkHKPPfXUUxw+fNh2LCoqiuDg4EolQUIIIYRouGo8EerduzfZ2dmsW7cOgIULFzJgwAC0Wi05OTkYjcYbromMjOTAgQPs3r0bk8nEokWLGDRoULnH2rVrx9tvv82RI0f4+eefef/995kwYUKN3asQQggh6rYa7xrT6XTMnTuX5557jn//+9+YzWaWLVsGwOjRo5k9ezbDhg0rdk1AQAAzZ85kypQpeHl54eHhwVtvvVXusSeffJKkpCQmT55MYGAg999/P08++WTN3rAQQggh6qwaT4QAhg0bxo8//khUVBQ9e/YkICAAgO3bt5d6zYMPPsigQYOIjY2lT58+eHl5lXvMxcWFf/7zn/zzn/+s3hsSQgghRL1UK4kQQEhIiG0sT0WFh4cTHl7yjIqyjgkhhBBClEQ2XRVCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjitWttrTAghGorMfCPZBcZSj2sUKDRaajAiIURFSSIkhBBVlF1g5OD5DIzmkpMdT72O8CCPGo5KCFERkggJIYQDGM0WCk0lJ0J6rbQGCVFXyRghIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCCCGE05JESAghhBBOSxIhIYQQQjgtSYSEEEII4bQkERJCiDpCqcA5Go382BbCkXS1HYAQQgjQahTMFpWE9LxSz1EtKnov3xqMSoiGTxIhIYSoA7SKQq7BzKnkHIxmS8nnaCDMrYYDE6KBk0RICCHqEKPZQqGp5ETIRXrFhHA4+bYSQgghhNOSREgIIWpJZr6RlJzC2g5DCKcmXWNCCFELjiZksOZgAkazSpCXnl7hAUwLbF3bYQnhdCQREkKIGqSqKlv/TGZHzGXbayk5Bn44fhGtRiGyQ6NajE4I5yOJkBBC1KAjCRm2JOjmtkEMadeIqMRMvjucyMaoC/h7utCpiUyRF6KmyBghIYSoIWaLyrboSwBEdmjEiM5NcNdr6dMygNs7hQCwan+CjBsSogZJIiSEEDXkUHw6qbkGPPRabm4TVOzYiM5N6BzqQ6HJwg/HLtZShEI4H0mEhBCiBpgt1rFBADe3DcbVRVvsuFaj8LfItgBEX8giVVqFhKgRkggJIUQN2HHyEik51tagfq0CSjyneYAHHRt7owK/nU6p2QCFcFKSCAkhRA346crYoIFtgnDVaUs9b0h766yxg3Hp5BWaaiQ2IZyZJEJCCFHNcgtNHE3IAKBraNkzwloHe9LU1w2jWWXPubQaiE4I5yaJkBBCVLNjSZlYVGji60agl2uZ5yqKwqC21oHUe8+mYVHVmghRCKcliZAQQlSzIwmZAHQppzWoSOemvrjqNGTmG4lLzavO0IRwepIICSFENTKYLJy4mAVUPBHSaTV0auIDwNHEzGqLTQghiZAQQlSrmORsjGaVxj5uNPF1q/B1XcOsSdPxxEzpHhOiGkkiJIQQ1ejPC9bWoAFtAlEUpcLXtW7khbuLluxCE2dTcqsrPCGcniRCQghRTVRV5VRyNgD9WwVW6lqdRkNEU2v3WFSCdI8JUV0kERJCiGqSkmMg12DGRavQLsS70td3udI9diwpE7NFuseEqA6SCAkhRDU5n2rt0goP8ESvq/yP21ZBXnjoteQZzMSlyewxIaqDJEJCCFFNzl2Z+t4q2NOu67UahbaNvADroGshhOPVSiIUExPDuHHj6N27N/PmzUOtwIyIvXv3MmLECPr27cvixYsrfKyI0WjkL3/5C3v27HHIPQghRHnOXWkRah3sZXcZ7Rtbu9QkER
KietR4ImQwGJg6dSoRERGsWbOG2NhY1q5dW+Y1aWlpTJs2jVGjRrFy5Uo2bNjA7t27yz12rc8//5yYmJhquSchhLh
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.distplot(daily_data['燃料消耗量(吨)'], fit=norm)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 33,
"outputs": [
{
"data": {
"text/plain": "<Figure size 4000x4000 with 43 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAADFUAAAxICAYAAABKB0GsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZhe4+E//vdsiSxM7HuioYKIoEGFWqM0qJ2iltppFYkifJCKIvZ9r30XxFbUVmqv2oJGdGyJJUEiMtkms/z+8MvzNc1iEklmkuf1uq5elzn3Oc+5z+Sa53r3PM/73CUNDQ0NAQAAAAAAAAAAAAAAKDKlzT0BAAAAAAAAAAAAAACA5qBUAQAAAAAAAAAAAAAAFCWlCgAAAAAAAAAAAAAAoCgpVQAAAAAAAAAAAAAAAEVJqQIAAAAAAAAAAAAAAChKShUAAAAAAAAAAAAAAEBRUqoAAAAAAAAAAAAAAACKklIFAAAAAAAAAAAAAABQlJQqAAAAKCoNDQ3NPQUAAAAAAAAAAFoIpQoAAADmua+//jpjx46d5+etqanJdtttlzvvvHO2X6O6ujojR45MktTW1uaAAw7Iv//97zk1RQAA/n81NTV54403Zpgb99tvv5xxxhkzPP7LL7/MiBEjZjg+duzYfP311z96ngAAzF3PPPNM7r777ibv//nnn+e+++7LlClTpjteVVWVurq6OTU9AADmoDn5gLwnn3wyRx555Bx7PWDBplQBAADAPHfuuedm0003LZQTmuqkk05K7969kyRjxozJVlttldtvv73Jx99xxx0ZNWpUfvGLXzT5mJEjR+b111/P3XffneOPPz6bbrpp+vXrl4aGhgwdOjTPP/982rZtO0vXAQDADxszZkz22GOPvPXWW9MdLysrS3l5+QyPv+iii3L00UfPcPy8887LIYcc8mOnCQDAXDBp0qTCfz/zzDO55557Cj9PnDgxn3322QyPfeSRR9K/f/+MHj06STJ69OicffbZ+fLLL/PZZ5+ld+/eeeedd+be5AEAmC0vvvhifvvb32bMmDFJklGjRuWTTz7J559/ni+++KLR/yZPnvyDr1dZWZm///3vefXVVwvb6uvrM3HixNTU1My16wDmTzP+tAEAAADmgsceeyz33XdflltuuVx22WU57bTTmnzslClTCk8Tbt++fT755JMmP1Xus88+y0UXXZTq6upsvvnm091n5ZVXzuDBg9OqVavCtgMOOCBjx47NwgsvnMmTJ+e0007LGmuskSR57bXXsuqqq2b11Vdv8jUAANA0UzNZRUVFYdsBBxyQXXbZJdtuu20qKipSWjrjZ0e1atWqUa6b3vj3XxsAgJZhypQpOeKII7LCCivktNNOS1lZWaPcd+edd+bSSy/Nn//852y77bbTHP/ggw9mjz32yNJLL50kefbZZ3PTTTflgAMOyHLLLZeVV145zz//fNZaa615dk0AAPywm2++OZWVlVl00UWTJBdccEH+9re/ZaGFFirsM2HChLRv3z6PPfZYWrdunSQ/+FC9vffee5pt/fv3z5577jmHrwCYnylVAAAAMM+89tprOf7443PggQdmjz32yI477pjFFlssRx11VEpKSn7w+LZt2xZujlVUVKSsrKxJq0RMnDgxffr0yfrrr59zzz13mvGHH344/fv3zxlnnDHNF+8GDRqUNm3a5Pbbb88DDzzQ6IPa1157LcOGDUuXLl2mec37778/q6222g/ODQCA6SsrK0uSRjlx5MiRmTBhwnT3/+CDD9K5c+dGx099jempqKhQqgAAaGHGjx+fvn375oMPPsjJJ5+cJNOUKnr27Jk2bdrkhBNOyL///e+ceOKJhRXMnn322bz//vu54oorCvs/8sgj+eUvf5klllgiSbL99ttn0KBBOfTQQ2da0gUAYN4ZMmRIXnrppTz88MNJkurq6px55pk588wzC/t8/vnn2XHHHXPiiSdmkUUWKWyfmgXvvv
vuwue2J554Yvbcc89069YtSfLuu+/moosuyvnnn5+ysrLCZ84AU/l/hwAAAMwTzzzzTA488MBsvfXW6dOnTzp27JjLLrssN910U4444oiMHDmy0f4NDQ154YUXGi3HWlFRUbgpNj3nnXdennrqqUbbqqurc8QRR+Sbb75Jnz59MnDgwJSXl6ddu3Zp165dvv3225xzzjk55JBDsvbaa0/zmm3atJnuuWpqavLPf/4z119/fZ555pnC/w466KBUVlZm1VVXnYXfDgAAUzU0NGTChAmpra1NktTX12fChAmpr69PWVlZoyLEhx9+mHPOOSdbb711DjzwwJm+7ujRo7PVVlvllVdemavzBwBg9rz55pvZaaedMmrUqNx55535yU9+UhgrKSnJJ598kj59+mSHHXbIEksskRtvvDGPPPJIXnrppSTf5cjLLrssq666apZZZpkkyfvvv59nnnkmv/vd7wqvteuuu+arr77Ko48+Om8vEACA6WpoaMiZZ56ZP/7xj1l22WXz1ltvZcstt8ywYcMK+4wdOzaHHnpottpqq2y//faNjp9alC0vL09DQ0MqKiry9NNPZ+zYsWndunVat26d2travPjii1l44YXTtm1b5VpgGlaqAAAAYK765ptvcskll+TWW2/Nvvvum379+qWkpCR33XVX1l577dx8883p06dPfvnLX2afffbJHnvskRVXXDH19fXp169fVl555Vx33XU/eJ7q6upce+212W233bLFFlsk+a748Nvf/jajRo3K7bffns8++ywPPfRQ/vvf/+aKK65IfX19Dj300Kyyyir5wx/+MM1r1tXVZfTo0amoqMjkyZPT0NCQcePGpba2Nq+88krat2+fnj17Njrm008/TY8ePdyIAwCYTWPGjMmGG25Y+Hn//fdPkjz55JMpKSnJu+++m1dffTWvvPJKKioq0q5du6y77rqFMu5NN92Ul156KcOGDcu4ceNy2GGHpVOnTvnDH/6QTz75pFFOmzhxYqqqqlJXV5eampqsueaa8/RaAQD4f37yk59kp512yoEHHlhYTXby5MkZOXJk3n777eywww7p3bt3Hn744cIKZY8//njat2+fJBk8eHDeeOONdO3aNcn/+3LeZpttVnhCcZIsueSS2XvvvXPGGWdk4403bvSUYwAA5r3LLrsstbW12XfffTN58uScccYZ2WCDDQoPsRs5cmQOPfTQfPDBB1lppZXy9ddfZ/HFF5/mdYYOHZqddtop5eXlqa2tzVFHHVVYBbehoSFJss4666S2tjZ9+vTJIYccMu8uEmjxlCoAAACYK7766qvcfPPNuf3229PQ0JCLLrooW2+9dWH8rLPOynHHHZff/OY3eeCBB3LppZfmpptuyjXXXJPdd989AwYMyM4775wrr7wyX3zxReHpcjPy0ksvpb6+PrvuumthW6tWrXLGGWekoqIinTp1SqdOnXLHHXfkgAMOyF577ZWKiorU1NTk+uuvn+4KGCNGjMgvf/nLRtt69OiRn/zkJ+nQoUM233zzJMmgQYMyePDg3HLLLfnPf/6TPffc88f86gAAilqHDh3y4osvZvLkydlss81y9dVXp6SkJJdddlk++uijjBo1Kr/61a+y0korZaONNsqxxx6be++9t1CqWG655dK1a9eMHj06paWl6dmzZzp06FAoU3x/JbJ33303u+22WxoaGjJ58uS8++67zXLNAAAkiyyySA4//PBMmDAhzz33XB5//PE88cQTmTBhQnr27Jntt98+HTp0yIgRIzJixIi0b98+6667bpLk888/z1lnnZWVVlqp8Ho33nhjXnnllQwePHiac/3hD3/IE088keOOOy6XXHJJo9XQAACYd2pqanLrrbdmwoQJ6d69e2pqarLSSivl2muvTZI88cQT6d+/fzbaaKP89a9/zVlnnZXevXunX79+2XHHHZN8t9JtknTr1i3vvPNOysvLs+666+a8884rfJ776quvZu+9987bb7
+dhoaG1NXVNcv1Ai2XUgUAAABzRUlJSQYPHpzNNtssf/rTnzJkyJC8/vrrWWeddZIkFRUVhSJD69at07dv3+y///6
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = plt.figure(figsize=(40, 40))\n",
"for index, col in enumerate(daily_data.columns):\n",
" if col != 'days':\n",
" # u = x_data[col].mean()\n",
" # std = x_data[col].std()\n",
" try:\n",
" plt.subplot(9,5,index+1)\n",
" plt.title(col)\n",
" plt.hist(daily_data[col])\n",
" except:\n",
" print(col)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 34,
"outputs": [],
"source": [
"use_cols = ['days']\n",
"for col in daily_data.columns:\n",
" if col != 'days':\n",
" if daily_data[col].value_counts().shape[0] > 1 and daily_data[col].isna().sum() / daily_data.shape[0] <= 0.01:\n",
" use_cols.append(col)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 35,
"outputs": [
{
"data": {
"text/plain": "['days',\n '发电量(千瓦时)',\n '供热量(吉焦)',\n '机组运行时间(小时)',\n '硫分(%',\n '脱硫剂使用量(吨)',\n '脱硫设施运行时间(小时)',\n '脱硝还原剂消耗量(吨)',\n '脱硝运行时间(小时)',\n '燃料消耗量(吨)']"
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"use_cols"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 36,
"outputs": [
{
"data": {
"text/plain": "(1108, 10)"
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp_data = daily_data[use_cols].copy()\n",
"tmp_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 37,
"outputs": [],
"source": [
"train_data = tmp_data.merge(final_hourly_data, on='days', how='left')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": "(1108, 164)"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [
{
"data": {
"text/plain": "24.0 1103\n21.0 2\n0.0 1\n15.5 1\n19.0 1\nName: 机组运行时间(小时), dtype: int64"
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data['机组运行时间(小时)'].value_counts()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [
{
"data": {
"text/plain": "(1087, 164)"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_data = train_data[~((train_data['机组运行时间(小时)']==0)|(train_data['燃料消耗量(吨)']<=200)|(train_data.flow==0)|(train_data.flow.isna()))].copy()\n",
"valid_data.shape"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [],
"source": [
"import datetime as dt"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [],
"source": [
"def cal_timedelta(x):\n",
" date = dt.datetime.strptime(x, '%Y-%m-%d')\n",
" date = dt.date(date.year, date.month, date.day)\n",
" start_date = dt.date(date.year, 1, 1)\n",
" time_delta = (date - start_date).days\n",
" return time_delta"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 43,
"outputs": [
{
"data": {
"text/plain": " days 发电量(千瓦时) 供热量(吉焦) 机组运行时间(小时) 硫分(% 脱硫剂使用量(吨) \\\n0 2018-10-01 156796.00 6536.83 24.0 0.51 5.06 \n1 2018-10-02 133984.00 2484.64 24.0 0.51 5.04 \n2 2018-10-03 134023.00 3020.83 24.0 0.51 5.04 \n3 2018-10-04 124765.00 5599.23 24.0 0.51 5.03 \n4 2018-10-05 134414.00 4702.65 24.0 0.51 5.06 \n... ... ... ... ... ... ... \n1103 2022-01-22 52.24 12472.00 24.0 0.59 8.46 \n1104 2022-01-23 51.36 12051.00 24.0 0.59 8.46 \n1105 2022-01-24 51.12 11276.00 24.0 0.59 8.43 \n1106 2022-01-25 49.32 11007.00 24.0 0.59 8.43 \n1107 2022-01-26 29.64 8132.00 24.0 0.59 8.44 \n\n 脱硫设施运行时间(小时) 脱硝还原剂消耗量(吨) 脱硝运行时间(小时) 燃料消耗量(吨) ... cSO2 \\\n0 24.0 2.98 24.0 323 ... 1.810937e+07 \n1 24.0 2.97 24.0 218 ... 1.337057e+07 \n2 24.0 2.95 24.0 212 ... 2.404455e+07 \n3 24.0 2.98 24.0 223 ... 8.668474e+06 \n4 24.0 3.01 24.0 243 ... 1.579668e+06 \n... ... ... ... ... ... ... \n1103 24.0 4.56 24.0 822 ... 3.028139e+06 \n1104 24.0 4.58 24.0 790 ... 3.412421e+06 \n1105 24.0 4.57 24.0 751 ... 4.146250e+06 \n1106 24.0 4.56 24.0 672 ... 3.971702e+06 \n1107 24.0 4.57 24.0 484 ... 5.050733e+06 \n\n cO2 csmoke flow rNOx rO2 \\\n0 3.745944e+07 5.495410e+05 162345.192917 24.417792 9.900000 \n1 2.832146e+07 3.078217e+05 140175.330833 18.705945 9.400000 \n2 3.174159e+07 4.348207e+05 154686.184167 20.891791 8.550000 \n3 2.511504e+07 1.946970e+06 120345.545833 18.457892 10.202083 \n4 4.106346e+07 5.390776e+06 162533.103542 22.017321 11.497917 \n... ... ... ... ... ... 
\n1103 4.149625e+07 5.438282e+06 218349.604167 2.174717 7.921417 \n1104 4.422277e+07 5.194162e+06 210121.608333 5.565075 8.756333 \n1105 4.655727e+07 5.133802e+06 211378.329167 10.326585 9.110167 \n1106 7.959093e+07 5.497492e+06 240801.208333 4.874698 13.636042 \n1107 9.866431e+07 5.879454e+06 263197.579167 1.481812 15.621583 \n\n temp rSO2 rsmoke day_of_year \n0 51.250000 4.705029 0.182338 273 \n1 50.679167 3.675542 0.166718 274 \n2 52.808333 6.440365 0.117143 275 \n3 48.854167 2.364306 0.761071 276 \n4 45.783333 0.339330 1.858999 277 \n... ... ... ... ... \n1103 55.441542 0.576979 1.037822 21 \n1104 54.574333 0.678481 1.030052 22 \n1105 53.031042 0.818827 1.012412 23 \n1106 42.908458 0.698443 0.953106 24 \n1107 36.412917 0.801224 0.930781 25 \n\n[1087 rows x 165 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>days</th>\n <th>发电量(千瓦时)</th>\n <th>供热量(吉焦)</th>\n <th>机组运行时间(小时)</th>\n <th>硫分(%</th>\n <th>脱硫剂使用量(吨)</th>\n <th>脱硫设施运行时间(小时)</th>\n <th>脱硝还原剂消耗量(吨)</th>\n <th>脱硝运行时间(小时)</th>\n <th>燃料消耗量(吨)</th>\n <th>...</th>\n <th>cSO2</th>\n <th>cO2</th>\n <th>csmoke</th>\n <th>flow</th>\n <th>rNOx</th>\n <th>rO2</th>\n <th>temp</th>\n <th>rSO2</th>\n <th>rsmoke</th>\n <th>day_of_year</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>2018-10-01</td>\n <td>156796.00</td>\n <td>6536.83</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>5.06</td>\n <td>24.0</td>\n <td>2.98</td>\n <td>24.0</td>\n <td>323</td>\n <td>...</td>\n <td>1.810937e+07</td>\n <td>3.745944e+07</td>\n <td>5.495410e+05</td>\n <td>162345.192917</td>\n <td>24.417792</td>\n <td>9.900000</td>\n <td>51.250000</td>\n <td>4.705029</td>\n <td>0.182338</td>\n <td>273</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2018-10-02</td>\n <td>133984.00</td>\n <td>2484.64</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>5.04</td>\n <td>24.0</td>\n <td>2.97</td>\n <td>24.0</td>\n <td>218</td>\n <td>...</td>\n <td>1.337057e+07</td>\n <td>2.832146e+07</td>\n <td>3.078217e+05</td>\n <td>140175.330833</td>\n <td>18.705945</td>\n <td>9.400000</td>\n <td>50.679167</td>\n <td>3.675542</td>\n <td>0.166718</td>\n <td>274</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2018-10-03</td>\n <td>134023.00</td>\n <td>3020.83</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>5.04</td>\n <td>24.0</td>\n <td>2.95</td>\n <td>24.0</td>\n <td>212</td>\n <td>...</td>\n <td>2.404455e+07</td>\n <td>3.174159e+07</td>\n <td>4.348207e+05</td>\n <td>154686.184167</td>\n <td>20.891791</td>\n <td>8.550000</td>\n <td>52.808333</td>\n <td>6.440365</td>\n 
<td>0.117143</td>\n <td>275</td>\n </tr>\n <tr>\n <th>3</th>\n <td>2018-10-04</td>\n <td>124765.00</td>\n <td>5599.23</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>5.03</td>\n <td>24.0</td>\n <td>2.98</td>\n <td>24.0</td>\n <td>223</td>\n <td>...</td>\n <td>8.668474e+06</td>\n <td>2.511504e+07</td>\n <td>1.946970e+06</td>\n <td>120345.545833</td>\n <td>18.457892</td>\n <td>10.202083</td>\n <td>48.854167</td>\n <td>2.364306</td>\n <td>0.761071</td>\n <td>276</td>\n </tr>\n <tr>\n <th>4</th>\n <td>2018-10-05</td>\n <td>134414.00</td>\n <td>4702.65</td>\n <td>24.0</td>\n <td>0.51</td>\n <td>5.06</td>\n <td>24.0</td>\n <td>3.01</td>\n <td>24.0</td>\n <td>243</td>\n <td>...</td>\n <td>1.579668e+06</td>\n <td>4.106346e+07</td>\n <td>5.390776e+06</td>\n <td>162533.103542</td>\n <td>22.017321</td>\n <td>11.497917</td>\n <td>45.783333</td>\n <td>0.339330</td>\n <td>1.858999</td>\n <td>277</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_data['day_of_year'] = valid_data.days.apply(cal_timedelta)\n",
"valid_data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [],
"source": [
"valid_data.to_csv(\n",
" './train_data.csv', encoding='utf-8-sig', index=False\n",
")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}