520 KiB
520 KiB
In [1]:
import pandas as pd import numpy as np import matplotlib.pyplot as plt #新增加的两行 from pylab import mpl # 设置显示中文字体 mpl.rcParams["font.sans-serif"] = ["SimHei"] mpl.rcParams["axes.unicode_minus"] = False
In [2]:
daily_data = pd.read_excel('./data/机器学习样表.xlsx',sheet_name=0, header=[0, 1]) old_cols = daily_data.columns new_cols = [x[0].strip() if 'Unnamed' in x[1] else x[0]+'_'+x[1] for x in old_cols] daily_data.columns = new_cols daily_data.head()
Out[2]:
日期 | 企业名称 | 地址 | 省份 | 经度 | 纬度 | 烟囱高度(m) | 脱硝工艺 | 脱硝剂名称 | 脱硝设备数量 | ... | 供热量(吉焦) | 产渣量(吨) | 机组运行时间(小时) | 硫分(%) | 脱硫副产品产量(吨) | 脱硫剂使用量(吨) | 脱硫设施运行时间(小时) | 脱硝还原剂消耗量(吨) | 脱硝运行时间(小时) | 燃料消耗量(吨) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-10-01 | 浙江秀舟热电有限公司 | 嘉兴市南湖区凤桥镇 | 浙江省 | 120°51′5.54″ | 30°39′14.76″ | 80 | SNCR SCR | 氨水 | 3 | ... | 6536.83 | NaN | 24.0 | 0.51 | NaN | 5.06 | 24.0 | 2.98 | 24.0 | 323 |
1 | 2018-10-02 | 浙江秀舟热电有限公司 | 嘉兴市南湖区凤桥镇 | 浙江省 | 120°51′5.54″ | 30°39′14.76″ | 80 | SNCR SCR | 氨水 | 3 | ... | 2484.64 | NaN | 24.0 | 0.51 | NaN | 5.04 | 24.0 | 2.97 | 24.0 | 218 |
2 | 2018-10-03 | 浙江秀舟热电有限公司 | 嘉兴市南湖区凤桥镇 | 浙江省 | 120°51′5.54″ | 30°39′14.76″ | 80 | SNCR SCR | 氨水 | 3 | ... | 3020.83 | NaN | 24.0 | 0.51 | NaN | 5.04 | 24.0 | 2.95 | 24.0 | 212 |
3 | 2018-10-04 | 浙江秀舟热电有限公司 | 嘉兴市南湖区凤桥镇 | 浙江省 | 120°51′5.54″ | 30°39′14.76″ | 80 | SNCR SCR | 氨水 | 3 | ... | 5599.23 | NaN | 24.0 | 0.51 | 72.52 | 5.03 | 24.0 | 2.98 | 24.0 | 223 |
4 | 2018-10-05 | 浙江秀舟热电有限公司 | 嘉兴市南湖区凤桥镇 | 浙江省 | 120°51′5.54″ | 30°39′14.76″ | 80 | SNCR SCR | 氨水 | 3 | ... | 4702.65 | NaN | 24.0 | 0.51 | NaN | 5.06 | 24.0 | 3.01 | 24.0 | 243 |
5 rows × 44 columns
In [3]:
daily_data.rename(columns={"日期": "days"}, inplace=True) daily_data.days = daily_data.days.astype(str)
In [4]:
daily_data.shape
Out[4]:
(1178, 44)
In [5]:
hourly_data = pd.read_excel('./data/机器学习样表.xlsx',sheet_name=1).drop_duplicates() hourly_data.columns = ['date', 'flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke'] hourly_data.date = hourly_data.date.astype("datetime64") ori_hourly_data = hourly_data.copy() hourly_data
Out[5]:
date | flow | rNOx | rO2 | temp | rSO2 | rsmoke | |
---|---|---|---|---|---|---|---|
0 | 2018-03-23 00:00:00 | 244136.19 | 28.6370 | 7.700 | 51.400 | 0.8900 | 1.2000 |
1 | 2018-03-23 01:00:00 | 234599.89 | 29.9710 | 7.800 | 51.300 | 0.7600 | 1.1700 |
2 | 2018-03-23 02:00:00 | 249264.88 | 20.9960 | 7.300 | 54.900 | 2.1800 | 1.3600 |
3 | 2018-03-23 03:00:00 | 229360.17 | 24.3590 | 7.500 | 52.700 | 1.9600 | 1.3500 |
4 | 2018-03-23 04:00:00 | 236416.45 | 18.3680 | 7.200 | 55.100 | 1.6500 | 1.3500 |
... | ... | ... | ... | ... | ... | ... | ... |
33714 | 2022-01-26 19:00:00 | 255639.10 | 2.1000 | 15.719 | 36.720 | 1.7939 | 1.0533 |
33715 | 2022-01-26 20:00:00 | 253412.80 | 1.6378 | 15.580 | 36.812 | 1.7928 | 1.0543 |
33716 | 2022-01-26 21:00:00 | 261648.90 | 2.0940 | 15.595 | 36.948 | 1.8048 | 1.0547 |
33717 | 2022-01-26 22:00:00 | 271429.70 | 1.9489 | 15.532 | 37.160 | 1.7887 | 1.0566 |
33718 | 2022-01-26 23:00:00 | 272750.00 | 1.5552 | 15.435 | 37.279 | 1.7655 | 1.0570 |
33715 rows × 7 columns
In [7]:
hourly_data['rNOx'] = hourly_data.apply((lambda x: x['rNOx'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1) hourly_data['rSO2'] = hourly_data.apply((lambda x: x['rSO2'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1) hourly_data['rsmoke'] = hourly_data.apply((lambda x: x['rsmoke'] * 101800 * 273 / (101325 * (273 + x['temp']))), axis=1)
我们将每天24个小时的数据作为特征,因此对数据不足24小时的,要先填充Nan,以备后续处理
In [8]:
hour_range = pd.date_range(hourly_data.date.min(), hourly_data.date.max(), freq='H') hour_range.shape[0], hourly_data.shape[0]
Out[8]:
(33744, 33715)
可见少了约30条数据,因此进行index对齐
In [9]:
hourly_data = hourly_data.set_index("date").reindex(hour_range) hourly_data['days'] = hourly_data.index.astype(str).to_series().apply(lambda x:x.split(' ')[0]).values
异常值处理¶
对于出现的负值,若一天之中出现的较多,这一天的数据视为脏数据,可以统一处理
In [10]:
hourly_data[hourly_data.rNOx < 0].iloc[0]
Out[10]:
flow 49432.59 rNOx -0.338727 rO2 19.1 temp 34.7 rSO2 0.124794 rsmoke 0.490263 days 2020-12-01 Name: 2020-12-01 12:00:00, dtype: object
In [11]:
hourly_data[hourly_data._get_numeric_data() < 0] = np.nan hourly_data[hourly_data.index=='2020-12-01 12:00:00']
Out[11]:
flow | rNOx | rO2 | temp | rSO2 | rsmoke | days | |
---|---|---|---|---|---|---|---|
2020-12-01 12:00:00 | 49432.59 | NaN | 19.1 | 34.7 | 0.124794 | 0.490263 | 2020-12-01 |
缺失值分析¶
In [12]:
# 特征列 num_cols = [x for x in hourly_data.columns if not x.startswith('da')] num_cols
Out[12]:
['flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke']
In [13]:
# 写一个逻辑,首先统计出每天有缺失数据的比例,对任一数据缺失记录高于4条的,判断该天在daily_data是否有生产记录,若无,则为脏数据,需要删去 del_date = list() for col in num_cols: na_counts = hourly_data[hourly_data[col].isna()].days.value_counts().to_dict() for date in na_counts: if na_counts.get(date) < 4: continue try: if date in del_date: continue if daily_data[daily_data.days==date].shape[0] == 0: del_date.append(date) continue if daily_data[daily_data.days==date]['发电量(千瓦时)'].values[0] > 100000 or daily_data[daily_data.days==date]['供热量(吉焦)'].values[0] > 2500: # 取缺失率高且有较大发电量的天作为删除值 del_date.append(date) except: print(date)
In [14]:
len(del_date)
Out[14]:
101
In [15]:
# 删掉不要的脏数据 hourly_data = hourly_data[~hourly_data.days.isin(del_date)].copy() daily_data = daily_data[~daily_data.days.isin(del_date)].copy() hourly_data.shape, daily_data.shape
Out[15]:
((31320, 7), (1108, 44))
最后看一下有无哪一天仍有很多缺失数据
In [16]:
hourly_data[hourly_data[num_cols].isnull().T.any()].days.value_counts()
Out[16]:
2019-04-08 22 2019-07-17 3 2018-12-19 3 2021-02-18 3 2019-01-16 3 .. 2019-03-12 1 2019-03-17 1 2019-03-27 1 2019-04-04 1 2019-05-24 1 Name: days, Length: 220, dtype: int64
于是再去掉2019-04-08的数据
In [17]:
hourly_data = hourly_data[hourly_data.days!='2019-04-08'].copy() hourly_data.shape
Out[17]:
(31296, 7)
缺失值补充¶
- 对Nan值 取其上下两个时刻的均值作为填充值
- 对于多个连续缺失的值,实际上应该用窗口法填充,但是难度太大,因此仍用均值填充
- 使用ffill和bfill,然后合并取均值。
In [18]:
hourly_data_ffill = hourly_data.ffill() hourly_data_ffill
Out[18]:
flow | rNOx | rO2 | temp | rSO2 | rsmoke | days | |
---|---|---|---|---|---|---|---|
2018-03-23 00:00:00 | 244136.19 | 24.212548 | 7.700 | 51.400 | 0.752494 | 1.014598 | 2018-03-23 |
2018-03-23 01:00:00 | 234599.89 | 25.348257 | 7.800 | 51.300 | 0.642777 | 0.989539 | 2018-03-23 |
2018-03-23 02:00:00 | 249264.88 | 17.562606 | 7.300 | 54.900 | 1.823513 | 1.137605 | 2018-03-23 |
2018-03-23 03:00:00 | 229360.17 | 20.513299 | 7.500 | 52.700 | 1.650563 | 1.136867 | 2018-03-23 |
2018-03-23 04:00:00 | 236416.45 | 15.354987 | 7.200 | 55.100 | 1.379341 | 1.128551 | 2018-03-23 |
... | ... | ... | ... | ... | ... | ... | ... |
2022-01-26 19:00:00 | 255639.10 | 1.859704 | 15.719 | 36.720 | 1.588630 | 0.932774 | 2022-01-26 |
2022-01-26 20:00:00 | 253412.80 | 1.449961 | 15.580 | 36.812 | 1.587185 | 0.933383 | 2022-01-26 |
2022-01-26 21:00:00 | 261648.90 | 1.853027 | 15.595 | 36.948 | 1.597107 | 0.933327 | 2022-01-26 |
2022-01-26 22:00:00 | 271429.70 | 1.723446 | 15.532 | 37.160 | 1.581778 | 0.934369 | 2022-01-26 |
2022-01-26 23:00:00 | 272750.00 | 1.374762 | 15.435 | 37.279 | 1.560663 | 0.934365 | 2022-01-26 |
31296 rows × 7 columns
In [19]:
hourly_data_bfill = hourly_data.bfill() hourly_data_bfill
Out[19]:
flow | rNOx | rO2 | temp | rSO2 | rsmoke | days | |
---|---|---|---|---|---|---|---|
2018-03-23 00:00:00 | 244136.19 | 24.212548 | 7.700 | 51.400 | 0.752494 | 1.014598 | 2018-03-23 |
2018-03-23 01:00:00 | 234599.89 | 25.348257 | 7.800 | 51.300 | 0.642777 | 0.989539 | 2018-03-23 |
2018-03-23 02:00:00 | 249264.88 | 17.562606 | 7.300 | 54.900 | 1.823513 | 1.137605 | 2018-03-23 |
2018-03-23 03:00:00 | 229360.17 | 20.513299 | 7.500 | 52.700 | 1.650563 | 1.136867 | 2018-03-23 |
2018-03-23 04:00:00 | 236416.45 | 15.354987 | 7.200 | 55.100 | 1.379341 | 1.128551 | 2018-03-23 |
... | ... | ... | ... | ... | ... | ... | ... |
2022-01-26 19:00:00 | 255639.10 | 1.859704 | 15.719 | 36.720 | 1.588630 | 0.932774 | 2022-01-26 |
2022-01-26 20:00:00 | 253412.80 | 1.449961 | 15.580 | 36.812 | 1.587185 | 0.933383 | 2022-01-26 |
2022-01-26 21:00:00 | 261648.90 | 1.853027 | 15.595 | 36.948 | 1.597107 | 0.933327 | 2022-01-26 |
2022-01-26 22:00:00 | 271429.70 | 1.723446 | 15.532 | 37.160 | 1.581778 | 0.934369 | 2022-01-26 |
2022-01-26 23:00:00 | 272750.00 | 1.374762 | 15.435 | 37.279 | 1.560663 | 0.934365 | 2022-01-26 |
31296 rows × 7 columns
In [20]:
hourly_data_fixed = (hourly_data_ffill[num_cols] + hourly_data_ffill[num_cols]) / 2
In [21]:
hourly_data_fixed['days'] = hourly_data_fixed.index.astype(str).to_series().apply(lambda x:x.split(' ')[0]).values hourly_data_fixed[hourly_data_fixed.index=='2020-12-01 12:00:00']
Out[21]:
flow | rNOx | rO2 | temp | rSO2 | rsmoke | days | |
---|---|---|---|---|---|---|---|
2020-12-01 12:00:00 | 49432.59 | 1.35584 | 19.1 | 34.7 | 0.124794 | 0.490263 | 2020-12-01 |
特征工程,将每天每小时的数据平铺开作为当天的24*6个特征¶
In [22]:
feature_cols = [f"{x}_{y}" for x in range(24) for y in num_cols] feature_cols
Out[22]:
['0_flow', '0_rNOx', '0_rO2', '0_temp', '0_rSO2', '0_rsmoke', '1_flow', '1_rNOx', '1_rO2', '1_temp', '1_rSO2', '1_rsmoke', '2_flow', '2_rNOx', '2_rO2', '2_temp', '2_rSO2', '2_rsmoke', '3_flow', '3_rNOx', '3_rO2', '3_temp', '3_rSO2', '3_rsmoke', '4_flow', '4_rNOx', '4_rO2', '4_temp', '4_rSO2', '4_rsmoke', '5_flow', '5_rNOx', '5_rO2', '5_temp', '5_rSO2', '5_rsmoke', '6_flow', '6_rNOx', '6_rO2', '6_temp', '6_rSO2', '6_rsmoke', '7_flow', '7_rNOx', '7_rO2', '7_temp', '7_rSO2', '7_rsmoke', '8_flow', '8_rNOx', '8_rO2', '8_temp', '8_rSO2', '8_rsmoke', '9_flow', '9_rNOx', '9_rO2', '9_temp', '9_rSO2', '9_rsmoke', '10_flow', '10_rNOx', '10_rO2', '10_temp', '10_rSO2', '10_rsmoke', '11_flow', '11_rNOx', '11_rO2', '11_temp', '11_rSO2', '11_rsmoke', '12_flow', '12_rNOx', '12_rO2', '12_temp', '12_rSO2', '12_rsmoke', '13_flow', '13_rNOx', '13_rO2', '13_temp', '13_rSO2', '13_rsmoke', '14_flow', '14_rNOx', '14_rO2', '14_temp', '14_rSO2', '14_rsmoke', '15_flow', '15_rNOx', '15_rO2', '15_temp', '15_rSO2', '15_rsmoke', '16_flow', '16_rNOx', '16_rO2', '16_temp', '16_rSO2', '16_rsmoke', '17_flow', '17_rNOx', '17_rO2', '17_temp', '17_rSO2', '17_rsmoke', '18_flow', '18_rNOx', '18_rO2', '18_temp', '18_rSO2', '18_rsmoke', '19_flow', '19_rNOx', '19_rO2', '19_temp', '19_rSO2', '19_rsmoke', '20_flow', '20_rNOx', '20_rO2', '20_temp', '20_rSO2', '20_rsmoke', '21_flow', '21_rNOx', '21_rO2', '21_temp', '21_rSO2', '21_rsmoke', '22_flow', '22_rNOx', '22_rO2', '22_temp', '22_rSO2', '22_rsmoke', '23_flow', '23_rNOx', '23_rO2', '23_temp', '23_rSO2', '23_rsmoke']
In [23]:
daily_emiss_data = hourly_data_fixed.groupby('days').agg(list) daily_emiss_data
Out[23]:
flow | rNOx | rO2 | temp | rSO2 | rsmoke | |
---|---|---|---|---|---|---|
days | ||||||
2018-03-23 | [244136.19, 234599.89, 249264.88, 229360.17, 2... | [24.212547548922526, 25.348256763471422, 17.56... | [7.7, 7.8, 7.3, 7.5, 7.2, 7.7, 7.3, 7.0, 6.2, ... | [51.4, 51.3, 54.9, 52.7, 55.1, 51.9, 51.7, 52.... | [0.7524938826881673, 0.642777189290924, 1.8235... | [1.0145984935121357, 0.9895385677241856, 1.137... |
2018-03-24 | [234070.07, 235778.62, 231371.79, 234002.61, 2... | [21.236174270928696, 21.840986084108295, 21.63... | [7.4, 7.5, 7.4, 7.3, 7.5, 7.6, 7.5, 6.8, 5.7, ... | [51.3, 51.6, 51.7, 52.0, 52.0, 51.9, 51.7, 52.... | [0.2791006216657959, 0.287292450812319, 0.2956... | [0.92187781095672, 0.8956764642972298, 0.92074... |
2018-03-25 | [228939.37, 232613.69, 229586.17, 235035.84, 2... | [17.854300390828886, 18.93070026209273, 18.963... | [7.4, 7.6, 7.8, 8.1, 8.0, 7.6, 7.4, 6.9, 5.9, ... | [52.6, 52.4, 52.3, 52.2, 52.0, 53.3, 55.5, 53.... | [5.332282211556822, 3.009154456372548, 4.68796... | [0.8255350027370751, 0.8260423997885425, 0.809... |
2018-03-26 | [231112.06, 225984.7, 224547.59, 221822.01, 21... | [25.934359751728444, 21.81253685318773, 21.535... | [7.5, 7.4, 7.5, 7.5, 7.5, 7.1, 7.0, 7.0, 5.7, ... | [52.4, 52.3, 52.2, 52.6, 52.4, 54.1, 55.2, 53.... | [2.0988220157892563, 1.7874981882009273, 1.796... | [0.9271904487422418, 0.944338665464641, 0.9108... |
2018-03-27 | [226140.95, 219510.17, 215491.43, 205450.35, 2... | [11.968345200357508, 21.178438475476018, 26.25... | [6.9, 7.4, 7.5, 7.6, 7.6, 7.2, 7.3, 6.6, 5.7, ... | [55.7, 53.0, 52.0, 52.0, 51.8, 52.2, 52.3, 53.... | [0.21695389751746164, 0.3281261324263327, 0.39... | [1.1014582489348053, 1.0853402841794082, 0.978... |
... | ... | ... | ... | ... | ... | ... |
2022-01-22 | [217544.3, 223416.0, 221987.6, 216571.4, 21647... | [3.045354822347651, 2.8669384829826425, 3.2684... | [7.756, 7.856, 7.885, 8.05, 8.674, 8.493, 8.32... | [55.539, 55.741, 55.812, 55.564, 55.491, 54.76... | [0.6289737165296122, 0.6122342875306045, 0.625... | [1.0424734269053981, 1.0191389782210867, 1.039... |
2022-01-23 | [204086.4, 213480.5, 207928.1, 210432.4, 20739... | [0.5965432853216215, 0.7573690670940121, 0.493... | [7.43, 8.015, 7.812, 7.805, 7.938, 8.12, 8.161... | [55.836, 55.577, 55.237, 55.573, 55.512, 54.88... | [0.5333190389186867, 0.5314856001529347, 0.561... | [1.0451184794574826, 1.022569279389579, 1.0344... |
2022-01-24 | [196331.5, 204396.6, 209247.7, 208345.4, 20784... | [11.031038246691466, 11.518845457518038, 13.18... | [9.172, 9.131, 8.842, 8.987, 9.201, 9.489, 9.1... | [52.855, 53.409, 53.889, 54.006, 53.944, 53.07... | [0.8258148706118134, 0.8324800807378938, 0.886... | [1.018401194529847, 0.9991105440570864, 1.0109... |
2022-01-25 | [241509.4, 251172.7, 222005.2, 216005.8, 21869... | [9.760638960096678, 9.68924708625155, 13.61718... | [12.177, 11.365, 10.113, 10.497, 10.501, 10.84... | [49.297, 48.991, 50.58, 50.893, 50.853, 50.416... | [0.819102568494695, 0.8377693046259155, 0.9752... | [0.981306152448034, 0.9660540603723952, 0.9886... |
2022-01-26 | [263819.8, 263171.3, 260461.9, 257509.9, 25479... | [3.085553795959118, 2.6547312605083198, 2.1764... | [15.38, 15.623, 15.897, 15.944, 15.919, 15.777... | [37.445, 37.231, 36.495, 35.936, 35.819, 35.78... | [0.5047465592805648, 0.5081007278163424, 0.497... | [0.9335116655624799, 0.9270826921667246, 0.929... |
1304 rows × 6 columns
In [24]:
def merge(x1, x2, x3, x4, x5, x6): return sum([x1, x2, x3, x4, x5, x6], [])
In [25]:
daily_emiss_data.columns
Out[25]:
Index(['flow', 'rNOx', 'rO2', 'temp', 'rSO2', 'rsmoke'], dtype='object')
In [26]:
daily_emiss_data['feature_cols'] = daily_emiss_data.apply(lambda row: merge(row['flow'], row['rNOx'], row['rO2'], row['temp'], row['rSO2'], row['rsmoke']), axis=1) train_hourly_data = pd.DataFrame.from_records(np.array(daily_emiss_data['feature_cols'].values)) train_hourly_data.set_index(daily_emiss_data.index, inplace=True) train_hourly_data.columns = feature_cols train_hourly_data
Out[26]:
0_flow | 0_rNOx | 0_rO2 | 0_temp | 0_rSO2 | 0_rsmoke | 1_flow | 1_rNOx | 1_rO2 | 1_temp | ... | 22_rO2 | 22_temp | 22_rSO2 | 22_rsmoke | 23_flow | 23_rNOx | 23_rO2 | 23_temp | 23_rSO2 | 23_rsmoke | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
days | |||||||||||||||||||||
2018-03-23 | 244136.19 | 234599.89 | 249264.88 | 229360.17 | 236416.45 | 236113.88 | 243835.88 | 254941.06 | 263172.44 | 265048.62 | ... | 1.108454 | 1.149077 | 1.116597 | 1.070484 | 0.965953 | 1.005916 | 1.013989 | 0.994012 | 0.978667 | 0.945210 |
2018-03-24 | 234070.07 | 235778.62 | 231371.79 | 234002.61 | 224972.48 | 212372.97 | 227885.84 | 252032.30 | 257109.81 | 252191.47 | ... | 1.139022 | 1.155038 | 1.161774 | 1.112287 | 0.960906 | 0.885317 | 0.892926 | 0.885317 | 0.870327 | 0.800264 |
2018-03-25 | 228939.37 | 232613.69 | 229586.17 | 235035.84 | 227862.00 | 233114.53 | 224467.42 | 252500.03 | 240797.89 | 247235.60 | ... | 0.924819 | 0.975402 | 1.093083 | 0.968145 | 0.934758 | 0.900796 | 0.934184 | 0.954531 | 0.960316 | 0.926621 |
2018-03-26 | 231112.06 | 225984.70 | 224547.59 | 221822.01 | 219699.59 | 216640.45 | 228010.51 | 255410.68 | 260558.74 | 253688.93 | ... | 0.849763 | 0.929620 | 0.984937 | 0.971792 | 0.953290 | 0.882604 | 0.943758 | 0.960906 | 0.995540 | 1.021165 |
2018-03-27 | 226140.95 | 219510.17 | 215491.43 | 205450.35 | 209391.39 | 202650.57 | 207802.10 | 247740.44 | 254831.56 | 251766.47 | ... | 1.292986 | 1.344673 | 1.348917 | 1.185938 | 1.146697 | 1.155129 | 1.088680 | 1.045839 | 0.937636 | 0.894574 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2022-01-22 | 217544.30 | 223416.00 | 221987.60 | 216571.40 | 216474.30 | 217356.80 | 215403.80 | 216200.90 | 224753.10 | 226410.50 | ... | 1.036403 | 1.037442 | 1.039126 | 1.045166 | 1.045001 | 1.037889 | 1.040583 | 1.039620 | 1.038964 | 1.041814 |
2022-01-23 | 204086.40 | 213480.50 | 207928.10 | 210432.40 | 207398.30 | 205233.10 | 202121.80 | 203224.70 | 218894.90 | 238739.10 | ... | 1.024024 | 1.032411 | 1.035073 | 1.033276 | 1.035860 | 1.033211 | 1.031058 | 1.025498 | 1.023998 | 1.022248 |
2022-01-24 | 196331.50 | 204396.60 | 209247.70 | 208345.40 | 207840.00 | 203811.20 | 204427.60 | 208614.90 | 204340.80 | 206945.40 | ... | 1.007186 | 1.008622 | 1.012684 | 1.021570 | 1.022493 | 1.015565 | 1.008908 | 0.994918 | 0.986704 | 0.987378 |
2022-01-25 | 241509.40 | 251172.70 | 222005.20 | 216005.80 | 218697.30 | 217854.50 | 198460.70 | 199237.90 | 203870.10 | 224508.80 | ... | 0.936690 | 0.932645 | 0.935892 | 0.934892 | 0.933680 | 0.932380 | 0.928861 | 0.928384 | 0.929663 | 0.932935 |
2022-01-26 | 263819.80 | 263171.30 | 260461.90 | 257509.90 | 254797.00 | 255295.30 | 255801.90 | 261359.50 | 267883.50 | 271961.30 | ... | 0.929660 | 0.929660 | 0.929660 | 0.929507 | 0.931257 | 0.932774 | 0.933383 | 0.933327 | 0.934369 | 0.934365 |
1304 rows × 144 columns
In [27]:
hourly_data_fixed['cNOx'] = hourly_data_fixed.flow * hourly_data_fixed.rNOx hourly_data_fixed['cO2'] = hourly_data_fixed.flow * hourly_data_fixed.rO2 hourly_data_fixed['cSO2'] = hourly_data_fixed.flow * hourly_data_fixed.rSO2 hourly_data_fixed['csmoke'] = hourly_data_fixed.flow * hourly_data_fixed.rsmoke
In [28]:
grp = hourly_data_fixed.groupby('days') new_data = grp.agg({'cNOx': sum, 'cSO2':sum, 'cO2':sum, 'csmoke':sum, 'flow':np.mean, 'rNOx':np.mean, 'rO2':np.mean, 'temp':np.mean,'rSO2':np.mean, 'rsmoke':np.mean}).reset_index() new_data.shape
Out[28]:
(1304, 11)
In [29]:
final_hourly_data = train_hourly_data.reset_index().merge(new_data, how='left', on='days') final_hourly_data.shape
Out[29]:
(1304, 155)
In [30]:
daily_data.shape
Out[30]:
(1108, 44)
In [31]:
import seaborn as sns from scipy.stats import norm
In [32]:
sns.distplot(daily_data['燃料消耗量(吨)'], fit=norm)
C:\Users\zhaojh\Miniconda3\envs\py38\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[32]:
<AxesSubplot:xlabel='燃料消耗量(吨)', ylabel='Density'>
In [33]:
fig = plt.figure(figsize=(40, 40)) for index, col in enumerate(daily_data.columns): if col != 'days': # u = x_data[col].mean() # std = x_data[col].std() try: plt.subplot(9,5,index+1) plt.title(col) plt.hist(daily_data[col]) except: print(col)
In [34]:
use_cols = ['days'] for col in daily_data.columns: if col != 'days': if daily_data[col].value_counts().shape[0] > 1 and daily_data[col].isna().sum() / daily_data.shape[0] <= 0.01: use_cols.append(col)
In [35]:
use_cols
Out[35]:
['days',
'发电量(千瓦时)',
'供热量(吉焦)',
'机组运行时间(小时)',
'硫分(%)',
'脱硫剂使用量(吨)',
'脱硫设施运行时间(小时)',
'脱硝还原剂消耗量(吨)',
'脱硝运行时间(小时)',
'燃料消耗量(吨)']
In [36]:
tmp_data = daily_data[use_cols].copy() tmp_data.shape
Out[36]:
(1108, 10)
In [37]:
train_data = tmp_data.merge(final_hourly_data, on='days', how='left')
In [38]:
train_data.shape
Out[38]:
(1108, 164)
In [39]:
train_data['机组运行时间(小时)'].value_counts()
Out[39]:
24.0 1103 21.0 2 0.0 1 15.5 1 19.0 1 Name: 机组运行时间(小时), dtype: int64
In [40]:
valid_data = train_data[~((train_data['机组运行时间(小时)']==0)|(train_data['燃料消耗量(吨)']<=200)|(train_data.flow==0)|(train_data.flow.isna()))].copy() valid_data.shape
Out[40]:
(1087, 164)
In [41]:
import datetime as dt
In [42]:
def cal_timedelta(x): date = dt.datetime.strptime(x, '%Y-%m-%d') date = dt.date(date.year, date.month, date.day) start_date = dt.date(date.year, 1, 1) time_delta = (date - start_date).days return time_delta
In [43]:
valid_data['day_of_year'] = valid_data.days.apply(cal_timedelta) valid_data
Out[43]:
days | 发电量(千瓦时) | 供热量(吉焦) | 机组运行时间(小时) | 硫分(%) | 脱硫剂使用量(吨) | 脱硫设施运行时间(小时) | 脱硝还原剂消耗量(吨) | 脱硝运行时间(小时) | 燃料消耗量(吨) | ... | cSO2 | cO2 | csmoke | flow | rNOx | rO2 | temp | rSO2 | rsmoke | day_of_year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-10-01 | 156796.00 | 6536.83 | 24.0 | 0.51 | 5.06 | 24.0 | 2.98 | 24.0 | 323 | ... | 1.810937e+07 | 3.745944e+07 | 5.495410e+05 | 162345.192917 | 24.417792 | 9.900000 | 51.250000 | 4.705029 | 0.182338 | 273 |
1 | 2018-10-02 | 133984.00 | 2484.64 | 24.0 | 0.51 | 5.04 | 24.0 | 2.97 | 24.0 | 218 | ... | 1.337057e+07 | 2.832146e+07 | 3.078217e+05 | 140175.330833 | 18.705945 | 9.400000 | 50.679167 | 3.675542 | 0.166718 | 274 |
2 | 2018-10-03 | 134023.00 | 3020.83 | 24.0 | 0.51 | 5.04 | 24.0 | 2.95 | 24.0 | 212 | ... | 2.404455e+07 | 3.174159e+07 | 4.348207e+05 | 154686.184167 | 20.891791 | 8.550000 | 52.808333 | 6.440365 | 0.117143 | 275 |
3 | 2018-10-04 | 124765.00 | 5599.23 | 24.0 | 0.51 | 5.03 | 24.0 | 2.98 | 24.0 | 223 | ... | 8.668474e+06 | 2.511504e+07 | 1.946970e+06 | 120345.545833 | 18.457892 | 10.202083 | 48.854167 | 2.364306 | 0.761071 | 276 |
4 | 2018-10-05 | 134414.00 | 4702.65 | 24.0 | 0.51 | 5.06 | 24.0 | 3.01 | 24.0 | 243 | ... | 1.579668e+06 | 4.106346e+07 | 5.390776e+06 | 162533.103542 | 22.017321 | 11.497917 | 45.783333 | 0.339330 | 1.858999 | 277 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1103 | 2022-01-22 | 52.24 | 12472.00 | 24.0 | 0.59 | 8.46 | 24.0 | 4.56 | 24.0 | 822 | ... | 3.028139e+06 | 4.149625e+07 | 5.438282e+06 | 218349.604167 | 2.174717 | 7.921417 | 55.441542 | 0.576979 | 1.037822 | 21 |
1104 | 2022-01-23 | 51.36 | 12051.00 | 24.0 | 0.59 | 8.46 | 24.0 | 4.58 | 24.0 | 790 | ... | 3.412421e+06 | 4.422277e+07 | 5.194162e+06 | 210121.608333 | 5.565075 | 8.756333 | 54.574333 | 0.678481 | 1.030052 | 22 |
1105 | 2022-01-24 | 51.12 | 11276.00 | 24.0 | 0.59 | 8.43 | 24.0 | 4.57 | 24.0 | 751 | ... | 4.146250e+06 | 4.655727e+07 | 5.133802e+06 | 211378.329167 | 10.326585 | 9.110167 | 53.031042 | 0.818827 | 1.012412 | 23 |
1106 | 2022-01-25 | 49.32 | 11007.00 | 24.0 | 0.59 | 8.43 | 24.0 | 4.56 | 24.0 | 672 | ... | 3.971702e+06 | 7.959093e+07 | 5.497492e+06 | 240801.208333 | 4.874698 | 13.636042 | 42.908458 | 0.698443 | 0.953106 | 24 |
1107 | 2022-01-26 | 29.64 | 8132.00 | 24.0 | 0.59 | 8.44 | 24.0 | 4.57 | 24.0 | 484 | ... | 5.050733e+06 | 9.866431e+07 | 5.879454e+06 | 263197.579167 | 1.481812 | 15.621583 | 36.412917 | 0.801224 | 0.930781 | 25 |
1087 rows × 165 columns
In [44]:
valid_data.to_csv( './train_data.csv', encoding='utf-8-sig', index=False )
In [ ]: