emission_detect_ai/data_analysis.ipynb

520 KiB
Raw Blame History

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
#新增加的两行
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]

mpl.rcParams["axes.unicode_minus"] = False
In [2]:
daily_data = pd.read_excel('./data/机器学习样表.xlsx',sheet_name=0, header=[0, 1])
old_cols = daily_data.columns
new_cols = [x[0].strip() if 'Unnamed' in x[1] else x[0]+'_'+x[1] for x in old_cols]
daily_data.columns = new_cols
daily_data.head()
Out[2]:
日期 企业名称 地址 省份 经度 纬度 烟囱高度m 脱硝工艺 脱硝剂名称 脱硝设备数量 ... 供热量(吉焦) 产渣量(吨) 机组运行时间(小时) 硫分(% 脱硫副产品产量(吨) 脱硫剂使用量(吨) 脱硫设施运行时间(小时) 脱硝还原剂消耗量(吨) 脱硝运行时间(小时) 燃料消耗量(吨)
0 2018-10-01 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 SNCR SCR 氨水 3 ... 6536.83 NaN 24.0 0.51 NaN 5.06 24.0 2.98 24.0 323
1 2018-10-02 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 SNCR SCR 氨水 3 ... 2484.64 NaN 24.0 0.51 NaN 5.04 24.0 2.97 24.0 218
2 2018-10-03 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 SNCR SCR 氨水 3 ... 3020.83 NaN 24.0 0.51 NaN 5.04 24.0 2.95 24.0 212
3 2018-10-04 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 SNCR SCR 氨水 3 ... 5599.23 NaN 24.0 0.51 72.52 5.03 24.0 2.98 24.0 223
4 2018-10-05 浙江秀舟热电有限公司 嘉兴市南湖区凤桥镇 浙江省 120°515.54″ 30°3914.76″ 80 SNCR SCR 氨水 3 ... 4702.65 NaN 24.0 0.51 NaN 5.06 24.0 3.01 24.0 243

5 rows × 44 columns

In [3]:
daily_data.rename(columns={"日期": "days"}, inplace=True)
daily_data.days = daily_data.days.astype(str)
In [4]:
daily_data.shape
Out[4]:
(1178, 44)
In [5]:
hourly_data = pd.read_excel('./data/机器学习样表.xlsx',sheet_name=1).drop_duplicates()
hourly_data.columns = ['date', 'flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke']
hourly_data.date = hourly_data.date.astype("datetime64")
ori_hourly_data = hourly_data.copy()
hourly_data
Out[5]:
date flow rNOx rO2 temp rSO2 rsmoke
0 2018-03-23 00:00:00 244136.19 28.6370 7.700 51.400 0.8900 1.2000
1 2018-03-23 01:00:00 234599.89 29.9710 7.800 51.300 0.7600 1.1700
2 2018-03-23 02:00:00 249264.88 20.9960 7.300 54.900 2.1800 1.3600
3 2018-03-23 03:00:00 229360.17 24.3590 7.500 52.700 1.9600 1.3500
4 2018-03-23 04:00:00 236416.45 18.3680 7.200 55.100 1.6500 1.3500
... ... ... ... ... ... ... ...
33714 2022-01-26 19:00:00 255639.10 2.1000 15.719 36.720 1.7939 1.0533
33715 2022-01-26 20:00:00 253412.80 1.6378 15.580 36.812 1.7928 1.0543
33716 2022-01-26 21:00:00 261648.90 2.0940 15.595 36.948 1.8048 1.0547
33717 2022-01-26 22:00:00 271429.70 1.9489 15.532 37.160 1.7887 1.0566
33718 2022-01-26 23:00:00 272750.00 1.5552 15.435 37.279 1.7655 1.0570

33715 rows × 7 columns

In [7]:
hourly_data['rNOx'] = hourly_data.apply((lambda x: x['rNOx'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1)
hourly_data['rSO2'] = hourly_data.apply((lambda x: x['rSO2'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1)
hourly_data['rsmoke'] = hourly_data.apply((lambda x: x['rsmoke'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1)

我们将每天24个小时的数据作为特征因此对数据不足24小时的要先填充Nan以备后续处理

In [8]:
hour_range = pd.date_range(hourly_data.date.min(), hourly_data.date.max(), freq='H')
hour_range.shape[0], hourly_data.shape[0]
Out[8]:
(33744, 33715)

可见少了约30条数据因此进行index对齐

In [9]:
hourly_data = hourly_data.set_index("date").reindex(hour_range)
hourly_data['days'] = hourly_data.index.astype(str).to_series().apply(lambda x:x.split(' ')[0]).values

异常值处理

对于出现的负值,若一天之中出现的较多,这一天的数据视为脏数据,可以统一处理

In [10]:
hourly_data[hourly_data.rNOx < 0].iloc[0]
Out[10]:
flow        49432.59
rNOx       -0.338727
rO2             19.1
temp            34.7
rSO2        0.124794
rsmoke      0.490263
days      2020-12-01
Name: 2020-12-01 12:00:00, dtype: object
In [11]:
hourly_data[hourly_data._get_numeric_data() < 0] = np.nan
hourly_data[hourly_data.index=='2020-12-01 12:00:00']
Out[11]:
flow rNOx rO2 temp rSO2 rsmoke days
2020-12-01 12:00:00 49432.59 NaN 19.1 34.7 0.124794 0.490263 2020-12-01

缺失值分析

In [12]:
# 特征列
num_cols = [x for x in hourly_data.columns if not x.startswith('da')]
num_cols
Out[12]:
['flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke']
In [13]:
# 写一个逻辑首先统计出每天有缺失数据的比例对任一数据缺失记录高于4条的判断该天在daily_data是否有生产记录若无则为脏数据需要删去
del_date = list()
for col in num_cols:
    na_counts = hourly_data[hourly_data[col].isna()].days.value_counts().to_dict()
    for date in na_counts:
        if na_counts.get(date) < 4:
            continue
        try:
            if date in del_date:
                continue
            if daily_data[daily_data.days==date].shape[0] == 0:
                del_date.append(date)
                continue
            if daily_data[daily_data.days==date]['发电量(千瓦时)'].values[0] > 100000 or  daily_data[daily_data.days==date]['供热量(吉焦)'].values[0] > 2500:
                # 取缺失率高且有较大发电量的天作为删除值
                del_date.append(date)
        except:
            print(date)
In [14]:
len(del_date)
Out[14]:
101
In [15]:
# 删掉不要的脏数据
hourly_data = hourly_data[~hourly_data.days.isin(del_date)].copy()
daily_data = daily_data[~daily_data.days.isin(del_date)].copy()
hourly_data.shape, daily_data.shape
Out[15]:
((31320, 7), (1108, 44))

最后看一下有无哪一天仍有很多缺失数据

In [16]:
hourly_data[hourly_data[num_cols].isnull().T.any()].days.value_counts()
Out[16]:
2019-04-08    22
2019-07-17     3
2018-12-19     3
2021-02-18     3
2019-01-16     3
              ..
2019-03-12     1
2019-03-17     1
2019-03-27     1
2019-04-04     1
2019-05-24     1
Name: days, Length: 220, dtype: int64

于是再去掉2019-04-08的数据

In [17]:
hourly_data = hourly_data[hourly_data.days!='2019-04-08'].copy()
hourly_data.shape
Out[17]:
(31296, 7)

缺失值补充

  1. 对Nan值 取其上下两个时刻的均值作为填充值
  2. 对于多个连续缺失的值,实际上应该用窗口法填充,但是难度太大,因此仍用均值填充
  3. 使用ffill和bfill然后合并取均值。
In [18]:
hourly_data_ffill = hourly_data.ffill()
hourly_data_ffill
Out[18]:
flow rNOx rO2 temp rSO2 rsmoke days
2018-03-23 00:00:00 244136.19 24.212548 7.700 51.400 0.752494 1.014598 2018-03-23
2018-03-23 01:00:00 234599.89 25.348257 7.800 51.300 0.642777 0.989539 2018-03-23
2018-03-23 02:00:00 249264.88 17.562606 7.300 54.900 1.823513 1.137605 2018-03-23
2018-03-23 03:00:00 229360.17 20.513299 7.500 52.700 1.650563 1.136867 2018-03-23
2018-03-23 04:00:00 236416.45 15.354987 7.200 55.100 1.379341 1.128551 2018-03-23
... ... ... ... ... ... ... ...
2022-01-26 19:00:00 255639.10 1.859704 15.719 36.720 1.588630 0.932774 2022-01-26
2022-01-26 20:00:00 253412.80 1.449961 15.580 36.812 1.587185 0.933383 2022-01-26
2022-01-26 21:00:00 261648.90 1.853027 15.595 36.948 1.597107 0.933327 2022-01-26
2022-01-26 22:00:00 271429.70 1.723446 15.532 37.160 1.581778 0.934369 2022-01-26
2022-01-26 23:00:00 272750.00 1.374762 15.435 37.279 1.560663 0.934365 2022-01-26

31296 rows × 7 columns

In [19]:
hourly_data_bfill = hourly_data.bfill()
hourly_data_bfill
Out[19]:
flow rNOx rO2 temp rSO2 rsmoke days
2018-03-23 00:00:00 244136.19 24.212548 7.700 51.400 0.752494 1.014598 2018-03-23
2018-03-23 01:00:00 234599.89 25.348257 7.800 51.300 0.642777 0.989539 2018-03-23
2018-03-23 02:00:00 249264.88 17.562606 7.300 54.900 1.823513 1.137605 2018-03-23
2018-03-23 03:00:00 229360.17 20.513299 7.500 52.700 1.650563 1.136867 2018-03-23
2018-03-23 04:00:00 236416.45 15.354987 7.200 55.100 1.379341 1.128551 2018-03-23
... ... ... ... ... ... ... ...
2022-01-26 19:00:00 255639.10 1.859704 15.719 36.720 1.588630 0.932774 2022-01-26
2022-01-26 20:00:00 253412.80 1.449961 15.580 36.812 1.587185 0.933383 2022-01-26
2022-01-26 21:00:00 261648.90 1.853027 15.595 36.948 1.597107 0.933327 2022-01-26
2022-01-26 22:00:00 271429.70 1.723446 15.532 37.160 1.581778 0.934369 2022-01-26
2022-01-26 23:00:00 272750.00 1.374762 15.435 37.279 1.560663 0.934365 2022-01-26

31296 rows × 7 columns

In [20]:
hourly_data_fixed = (hourly_data_ffill[num_cols] + hourly_data_ffill[num_cols]) / 2
In [21]:
hourly_data_fixed['days'] = hourly_data_fixed.index.astype(str).to_series().apply(lambda x:x.split(' ')[0]).values
hourly_data_fixed[hourly_data_fixed.index=='2020-12-01 12:00:00']
Out[21]:
flow rNOx rO2 temp rSO2 rsmoke days
2020-12-01 12:00:00 49432.59 1.35584 19.1 34.7 0.124794 0.490263 2020-12-01

特征工程将每天每小时的数据平铺开作为当天的24*6个特征

In [22]:
feature_cols = [f"{x}_{y}" for x in range(24) for y in num_cols]
feature_cols
Out[22]:
['0_flow',
 '0_rNOx',
 '0_rO2',
 '0_temp',
 '0_rSO2',
 '0_rsmoke',
 '1_flow',
 '1_rNOx',
 '1_rO2',
 '1_temp',
 '1_rSO2',
 '1_rsmoke',
 '2_flow',
 '2_rNOx',
 '2_rO2',
 '2_temp',
 '2_rSO2',
 '2_rsmoke',
 '3_flow',
 '3_rNOx',
 '3_rO2',
 '3_temp',
 '3_rSO2',
 '3_rsmoke',
 '4_flow',
 '4_rNOx',
 '4_rO2',
 '4_temp',
 '4_rSO2',
 '4_rsmoke',
 '5_flow',
 '5_rNOx',
 '5_rO2',
 '5_temp',
 '5_rSO2',
 '5_rsmoke',
 '6_flow',
 '6_rNOx',
 '6_rO2',
 '6_temp',
 '6_rSO2',
 '6_rsmoke',
 '7_flow',
 '7_rNOx',
 '7_rO2',
 '7_temp',
 '7_rSO2',
 '7_rsmoke',
 '8_flow',
 '8_rNOx',
 '8_rO2',
 '8_temp',
 '8_rSO2',
 '8_rsmoke',
 '9_flow',
 '9_rNOx',
 '9_rO2',
 '9_temp',
 '9_rSO2',
 '9_rsmoke',
 '10_flow',
 '10_rNOx',
 '10_rO2',
 '10_temp',
 '10_rSO2',
 '10_rsmoke',
 '11_flow',
 '11_rNOx',
 '11_rO2',
 '11_temp',
 '11_rSO2',
 '11_rsmoke',
 '12_flow',
 '12_rNOx',
 '12_rO2',
 '12_temp',
 '12_rSO2',
 '12_rsmoke',
 '13_flow',
 '13_rNOx',
 '13_rO2',
 '13_temp',
 '13_rSO2',
 '13_rsmoke',
 '14_flow',
 '14_rNOx',
 '14_rO2',
 '14_temp',
 '14_rSO2',
 '14_rsmoke',
 '15_flow',
 '15_rNOx',
 '15_rO2',
 '15_temp',
 '15_rSO2',
 '15_rsmoke',
 '16_flow',
 '16_rNOx',
 '16_rO2',
 '16_temp',
 '16_rSO2',
 '16_rsmoke',
 '17_flow',
 '17_rNOx',
 '17_rO2',
 '17_temp',
 '17_rSO2',
 '17_rsmoke',
 '18_flow',
 '18_rNOx',
 '18_rO2',
 '18_temp',
 '18_rSO2',
 '18_rsmoke',
 '19_flow',
 '19_rNOx',
 '19_rO2',
 '19_temp',
 '19_rSO2',
 '19_rsmoke',
 '20_flow',
 '20_rNOx',
 '20_rO2',
 '20_temp',
 '20_rSO2',
 '20_rsmoke',
 '21_flow',
 '21_rNOx',
 '21_rO2',
 '21_temp',
 '21_rSO2',
 '21_rsmoke',
 '22_flow',
 '22_rNOx',
 '22_rO2',
 '22_temp',
 '22_rSO2',
 '22_rsmoke',
 '23_flow',
 '23_rNOx',
 '23_rO2',
 '23_temp',
 '23_rSO2',
 '23_rsmoke']
In [23]:
daily_emiss_data = hourly_data_fixed.groupby('days').agg(list)
daily_emiss_data
Out[23]:
flow rNOx rO2 temp rSO2 rsmoke
days
2018-03-23 [244136.19, 234599.89, 249264.88, 229360.17, 2... [24.212547548922526, 25.348256763471422, 17.56... [7.7, 7.8, 7.3, 7.5, 7.2, 7.7, 7.3, 7.0, 6.2, ... [51.4, 51.3, 54.9, 52.7, 55.1, 51.9, 51.7, 52.... [0.7524938826881673, 0.642777189290924, 1.8235... [1.0145984935121357, 0.9895385677241856, 1.137...
2018-03-24 [234070.07, 235778.62, 231371.79, 234002.61, 2... [21.236174270928696, 21.840986084108295, 21.63... [7.4, 7.5, 7.4, 7.3, 7.5, 7.6, 7.5, 6.8, 5.7, ... [51.3, 51.6, 51.7, 52.0, 52.0, 51.9, 51.7, 52.... [0.2791006216657959, 0.287292450812319, 0.2956... [0.92187781095672, 0.8956764642972298, 0.92074...
2018-03-25 [228939.37, 232613.69, 229586.17, 235035.84, 2... [17.854300390828886, 18.93070026209273, 18.963... [7.4, 7.6, 7.8, 8.1, 8.0, 7.6, 7.4, 6.9, 5.9, ... [52.6, 52.4, 52.3, 52.2, 52.0, 53.3, 55.5, 53.... [5.332282211556822, 3.009154456372548, 4.68796... [0.8255350027370751, 0.8260423997885425, 0.809...
2018-03-26 [231112.06, 225984.7, 224547.59, 221822.01, 21... [25.934359751728444, 21.81253685318773, 21.535... [7.5, 7.4, 7.5, 7.5, 7.5, 7.1, 7.0, 7.0, 5.7, ... [52.4, 52.3, 52.2, 52.6, 52.4, 54.1, 55.2, 53.... [2.0988220157892563, 1.7874981882009273, 1.796... [0.9271904487422418, 0.944338665464641, 0.9108...
2018-03-27 [226140.95, 219510.17, 215491.43, 205450.35, 2... [11.968345200357508, 21.178438475476018, 26.25... [6.9, 7.4, 7.5, 7.6, 7.6, 7.2, 7.3, 6.6, 5.7, ... [55.7, 53.0, 52.0, 52.0, 51.8, 52.2, 52.3, 53.... [0.21695389751746164, 0.3281261324263327, 0.39... [1.1014582489348053, 1.0853402841794082, 0.978...
... ... ... ... ... ... ...
2022-01-22 [217544.3, 223416.0, 221987.6, 216571.4, 21647... [3.045354822347651, 2.8669384829826425, 3.2684... [7.756, 7.856, 7.885, 8.05, 8.674, 8.493, 8.32... [55.539, 55.741, 55.812, 55.564, 55.491, 54.76... [0.6289737165296122, 0.6122342875306045, 0.625... [1.0424734269053981, 1.0191389782210867, 1.039...
2022-01-23 [204086.4, 213480.5, 207928.1, 210432.4, 20739... [0.5965432853216215, 0.7573690670940121, 0.493... [7.43, 8.015, 7.812, 7.805, 7.938, 8.12, 8.161... [55.836, 55.577, 55.237, 55.573, 55.512, 54.88... [0.5333190389186867, 0.5314856001529347, 0.561... [1.0451184794574826, 1.022569279389579, 1.0344...
2022-01-24 [196331.5, 204396.6, 209247.7, 208345.4, 20784... [11.031038246691466, 11.518845457518038, 13.18... [9.172, 9.131, 8.842, 8.987, 9.201, 9.489, 9.1... [52.855, 53.409, 53.889, 54.006, 53.944, 53.07... [0.8258148706118134, 0.8324800807378938, 0.886... [1.018401194529847, 0.9991105440570864, 1.0109...
2022-01-25 [241509.4, 251172.7, 222005.2, 216005.8, 21869... [9.760638960096678, 9.68924708625155, 13.61718... [12.177, 11.365, 10.113, 10.497, 10.501, 10.84... [49.297, 48.991, 50.58, 50.893, 50.853, 50.416... [0.819102568494695, 0.8377693046259155, 0.9752... [0.981306152448034, 0.9660540603723952, 0.9886...
2022-01-26 [263819.8, 263171.3, 260461.9, 257509.9, 25479... [3.085553795959118, 2.6547312605083198, 2.1764... [15.38, 15.623, 15.897, 15.944, 15.919, 15.777... [37.445, 37.231, 36.495, 35.936, 35.819, 35.78... [0.5047465592805648, 0.5081007278163424, 0.497... [0.9335116655624799, 0.9270826921667246, 0.929...

1304 rows × 6 columns

In [24]:
def merge(x1, x2, x3, x4, x5, x6):
    return sum([x1, x2, x3, x4, x5, x6], [])
In [25]:
daily_emiss_data.columns
Out[25]:
Index(['flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke'], dtype='object')
In [26]:
daily_emiss_data['feature_cols'] = daily_emiss_data.apply(lambda row: merge(row['flow'], row['rNOx'], row['rO2'], row['temp'], row['rSO2'], row['rsmoke']), axis=1)
train_hourly_data = pd.DataFrame.from_records(np.array(daily_emiss_data['feature_cols'].values))
train_hourly_data.set_index(daily_emiss_data.index, inplace=True)
train_hourly_data.columns = feature_cols
train_hourly_data
Out[26]:
0_flow 0_rNOx 0_rO2 0_temp 0_rSO2 0_rsmoke 1_flow 1_rNOx 1_rO2 1_temp ... 22_rO2 22_temp 22_rSO2 22_rsmoke 23_flow 23_rNOx 23_rO2 23_temp 23_rSO2 23_rsmoke
days
2018-03-23 244136.19 234599.89 249264.88 229360.17 236416.45 236113.88 243835.88 254941.06 263172.44 265048.62 ... 1.108454 1.149077 1.116597 1.070484 0.965953 1.005916 1.013989 0.994012 0.978667 0.945210
2018-03-24 234070.07 235778.62 231371.79 234002.61 224972.48 212372.97 227885.84 252032.30 257109.81 252191.47 ... 1.139022 1.155038 1.161774 1.112287 0.960906 0.885317 0.892926 0.885317 0.870327 0.800264
2018-03-25 228939.37 232613.69 229586.17 235035.84 227862.00 233114.53 224467.42 252500.03 240797.89 247235.60 ... 0.924819 0.975402 1.093083 0.968145 0.934758 0.900796 0.934184 0.954531 0.960316 0.926621
2018-03-26 231112.06 225984.70 224547.59 221822.01 219699.59 216640.45 228010.51 255410.68 260558.74 253688.93 ... 0.849763 0.929620 0.984937 0.971792 0.953290 0.882604 0.943758 0.960906 0.995540 1.021165
2018-03-27 226140.95 219510.17 215491.43 205450.35 209391.39 202650.57 207802.10 247740.44 254831.56 251766.47 ... 1.292986 1.344673 1.348917 1.185938 1.146697 1.155129 1.088680 1.045839 0.937636 0.894574
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-01-22 217544.30 223416.00 221987.60 216571.40 216474.30 217356.80 215403.80 216200.90 224753.10 226410.50 ... 1.036403 1.037442 1.039126 1.045166 1.045001 1.037889 1.040583 1.039620 1.038964 1.041814
2022-01-23 204086.40 213480.50 207928.10 210432.40 207398.30 205233.10 202121.80 203224.70 218894.90 238739.10 ... 1.024024 1.032411 1.035073 1.033276 1.035860 1.033211 1.031058 1.025498 1.023998 1.022248
2022-01-24 196331.50 204396.60 209247.70 208345.40 207840.00 203811.20 204427.60 208614.90 204340.80 206945.40 ... 1.007186 1.008622 1.012684 1.021570 1.022493 1.015565 1.008908 0.994918 0.986704 0.987378
2022-01-25 241509.40 251172.70 222005.20 216005.80 218697.30 217854.50 198460.70 199237.90 203870.10 224508.80 ... 0.936690 0.932645 0.935892 0.934892 0.933680 0.932380 0.928861 0.928384 0.929663 0.932935
2022-01-26 263819.80 263171.30 260461.90 257509.90 254797.00 255295.30 255801.90 261359.50 267883.50 271961.30 ... 0.929660 0.929660 0.929660 0.929507 0.931257 0.932774 0.933383 0.933327 0.934369 0.934365

1304 rows × 144 columns

In [27]:
hourly_data_fixed['cNOx'] = hourly_data_fixed.flow * hourly_data_fixed.rNOx
hourly_data_fixed['cO2'] = hourly_data_fixed.flow * hourly_data_fixed.rO2
hourly_data_fixed['cSO2'] = hourly_data_fixed.flow * hourly_data_fixed.rSO2
hourly_data_fixed['csmoke'] = hourly_data_fixed.flow * hourly_data_fixed.rsmoke
In [28]:
grp = hourly_data_fixed.groupby('days')
new_data = grp.agg({'cNOx': sum, 'cSO2':sum, 'cO2':sum, 'csmoke':sum,
                    'flow':np.mean, 'rNOx':np.mean, 'rO2':np.mean, 'temp':np.mean,'rSO2':np.mean, 'rsmoke':np.mean}).reset_index()
new_data.shape
Out[28]:
(1304, 11)
In [29]:
final_hourly_data = train_hourly_data.reset_index().merge(new_data, how='left', on='days')
final_hourly_data.shape
Out[29]:
(1304, 155)
In [30]:
daily_data.shape
Out[30]:
(1108, 44)
In [31]:
import seaborn as sns
from scipy.stats import norm
In [32]:
sns.distplot(daily_data['燃料消耗量(吨)'], fit=norm)
C:\Users\zhaojh\Miniconda3\envs\py38\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[32]:
<AxesSubplot:xlabel='燃料消耗量(吨)', ylabel='Density'>
No description has been provided for this image
In [33]:
fig = plt.figure(figsize=(40, 40))
for index, col in enumerate(daily_data.columns):
    if col != 'days':
        # u = x_data[col].mean()
        # std = x_data[col].std()
        try:
            plt.subplot(9,5,index+1)
            plt.title(col)
            plt.hist(daily_data[col])
        except:
            print(col)
No description has been provided for this image
In [34]:
use_cols = ['days']
for col in daily_data.columns:
    if col != 'days':
        if daily_data[col].value_counts().shape[0] > 1 and daily_data[col].isna().sum() / daily_data.shape[0] <= 0.01:
            use_cols.append(col)
In [35]:
use_cols
Out[35]:
['days',
 '发电量(千瓦时)',
 '供热量(吉焦)',
 '机组运行时间(小时)',
 '硫分(%',
 '脱硫剂使用量(吨)',
 '脱硫设施运行时间(小时)',
 '脱硝还原剂消耗量(吨)',
 '脱硝运行时间(小时)',
 '燃料消耗量(吨)']
In [36]:
tmp_data = daily_data[use_cols].copy()
tmp_data.shape
Out[36]:
(1108, 10)
In [37]:
train_data = tmp_data.merge(final_hourly_data, on='days', how='left')
In [38]:
train_data.shape
Out[38]:
(1108, 164)
In [39]:
train_data['机组运行时间(小时)'].value_counts()
Out[39]:
24.0    1103
21.0       2
0.0        1
15.5       1
19.0       1
Name: 机组运行时间(小时), dtype: int64
In [40]:
valid_data = train_data[~((train_data['机组运行时间(小时)']==0)|(train_data['燃料消耗量(吨)']<=200)|(train_data.flow==0)|(train_data.flow.isna()))].copy()
valid_data.shape
Out[40]:
(1087, 164)
In [41]:
import datetime as dt
In [42]:
def cal_timedelta(x):
    date = dt.datetime.strptime(x, '%Y-%m-%d')
    date = dt.date(date.year, date.month, date.day)
    start_date = dt.date(date.year, 1, 1)
    time_delta = (date - start_date).days
    return time_delta
In [43]:
valid_data['day_of_year'] = valid_data.days.apply(cal_timedelta)
valid_data
Out[43]:
days 发电量(千瓦时) 供热量(吉焦) 机组运行时间(小时) 硫分(% 脱硫剂使用量(吨) 脱硫设施运行时间(小时) 脱硝还原剂消耗量(吨) 脱硝运行时间(小时) 燃料消耗量(吨) ... cSO2 cO2 csmoke flow rNOx rO2 temp rSO2 rsmoke day_of_year
0 2018-10-01 156796.00 6536.83 24.0 0.51 5.06 24.0 2.98 24.0 323 ... 1.810937e+07 3.745944e+07 5.495410e+05 162345.192917 24.417792 9.900000 51.250000 4.705029 0.182338 273
1 2018-10-02 133984.00 2484.64 24.0 0.51 5.04 24.0 2.97 24.0 218 ... 1.337057e+07 2.832146e+07 3.078217e+05 140175.330833 18.705945 9.400000 50.679167 3.675542 0.166718 274
2 2018-10-03 134023.00 3020.83 24.0 0.51 5.04 24.0 2.95 24.0 212 ... 2.404455e+07 3.174159e+07 4.348207e+05 154686.184167 20.891791 8.550000 52.808333 6.440365 0.117143 275
3 2018-10-04 124765.00 5599.23 24.0 0.51 5.03 24.0 2.98 24.0 223 ... 8.668474e+06 2.511504e+07 1.946970e+06 120345.545833 18.457892 10.202083 48.854167 2.364306 0.761071 276
4 2018-10-05 134414.00 4702.65 24.0 0.51 5.06 24.0 3.01 24.0 243 ... 1.579668e+06 4.106346e+07 5.390776e+06 162533.103542 22.017321 11.497917 45.783333 0.339330 1.858999 277
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1103 2022-01-22 52.24 12472.00 24.0 0.59 8.46 24.0 4.56 24.0 822 ... 3.028139e+06 4.149625e+07 5.438282e+06 218349.604167 2.174717 7.921417 55.441542 0.576979 1.037822 21
1104 2022-01-23 51.36 12051.00 24.0 0.59 8.46 24.0 4.58 24.0 790 ... 3.412421e+06 4.422277e+07 5.194162e+06 210121.608333 5.565075 8.756333 54.574333 0.678481 1.030052 22
1105 2022-01-24 51.12 11276.00 24.0 0.59 8.43 24.0 4.57 24.0 751 ... 4.146250e+06 4.655727e+07 5.133802e+06 211378.329167 10.326585 9.110167 53.031042 0.818827 1.012412 23
1106 2022-01-25 49.32 11007.00 24.0 0.59 8.43 24.0 4.56 24.0 672 ... 3.971702e+06 7.959093e+07 5.497492e+06 240801.208333 4.874698 13.636042 42.908458 0.698443 0.953106 24
1107 2022-01-26 29.64 8132.00 24.0 0.59 8.44 24.0 4.57 24.0 484 ... 5.050733e+06 9.866431e+07 5.879454e+06 263197.579167 1.481812 15.621583 36.412917 0.801224 0.930781 25

1087 rows × 165 columns

In [44]:
valid_data.to_csv(
    './train_data.csv', encoding='utf-8-sig', index=False
)
In [ ]: