forked from EEBD_AI/wgz_forecast
71 KiB
71 KiB
In [198]:
import pandas as pd import numpy as np
In [199]:
import pandas as pd # 读取原始 CSV 文件 data = pd.read_csv('./data/load_original.csv') # 打印原始数据的前几行以检查格式 print("原始数据:") print(data.head()) # 定义一个函数来清理数值列 def clean_numeric_column(column): # 将逗号替换为小数点,并转换为浮点数 return column.apply(lambda x: float(str(x).replace(',', '.')) if isinstance(x, str) else x) # 获取所有需要清理的数值列(假设从 'h1' 到 'h24') numeric_columns = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'h9', 'h10', 'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18', 'h19', 'h20', 'h21', 'h22', 'h23', 'h24'] # 对每个数值列应用清理函数 for col in numeric_columns: data[col] = clean_numeric_column(data[col]) # 打印清理后的数据 print("\n清理后的数据:") print(data.head())
原始数据: zone_id year month day h1 h2 h3 h4 h5 h6 \ 0 1 2004 1 1 16,853 16,450 16,517 16,873 17,064 17,727 1 1 2004 1 2 14,155 14,038 14,019 14,489 14,920 16,072 2 1 2004 1 3 14,439 14,272 14,109 14,081 14,775 15,491 3 1 2004 1 4 11,273 10,415 9,943 9,859 9,881 10,248 4 1 2004 1 5 10,750 10,321 10,107 10,065 10,419 12,101 ... h15 h16 h17 h18 h19 h20 h21 h22 \ 0 ... 13,518 13,138 14,130 16,809 18,150 18,235 17,925 16,904 1 ... 16,127 15,448 15,839 17,727 18,895 18,650 18,443 17,580 2 ... 13,507 13,414 13,826 15,825 16,996 16,394 15,406 14,278 3 ... 14,207 13,614 14,162 16,237 17,430 17,218 16,633 15,238 4 ... 13,845 14,350 15,501 17,307 18,786 19,089 19,192 18,416 h23 h24 0 16,162 14,750 1 16,467 15,258 2 13,315 12,424 3 13,580 11,727 4 17,006 16,018 [5 rows x 28 columns] 清理后的数据: zone_id year month day h1 h2 h3 h4 h5 h6 \ 0 1 2004 1 1 16.853 16.450 16.517 16.873 17.064 17.727 1 1 2004 1 2 14.155 14.038 14.019 14.489 14.920 16.072 2 1 2004 1 3 14.439 14.272 14.109 14.081 14.775 15.491 3 1 2004 1 4 11.273 10.415 9.943 9.859 9.881 10.248 4 1 2004 1 5 10.750 10.321 10.107 10.065 10.419 12.101 ... h15 h16 h17 h18 h19 h20 h21 h22 \ 0 ... 13.518 13.138 14.130 16.809 18.150 18.235 17.925 16.904 1 ... 16.127 15.448 15.839 17.727 18.895 18.650 18.443 17.580 2 ... 13.507 13.414 13.826 15.825 16.996 16.394 15.406 14.278 3 ... 14.207 13.614 14.162 16.237 17.430 17.218 16.633 15.238 4 ... 13.845 14.350 15.501 17.307 18.786 19.089 19.192 18.416 h23 h24 0 16.162 14.750 1 16.467 15.258 2 13.315 12.424 3 13.580 11.727 4 17.006 16.018 [5 rows x 28 columns]
In [200]:
use_data = data[data['zone_id']==1].drop(columns=data.columns[:4])
In [201]:
user_data_flatten=use_data.values.flatten()
In [202]:
user_data_flatten
Out[202]:
array([16.853, 16.45 , 16.517, ..., nan, nan, nan])
In [203]:
import pandas as pd # 定义起始日期和时间 start_date = '2004-01-01 00:00:00' # 使用 pd.date_range 生成时间索引 time_index = pd.date_range(start=start_date, periods=len(use_data.values.flatten()), freq='15T') # 打印生成的时间索引 print(len(time_index))
39600
C:\Users\liuhao\AppData\Local\Temp\ipykernel_18300\3270028511.py:6: FutureWarning: 'T' is deprecated and will be removed in a future version, please use 'min' instead. time_index = pd.date_range(start=start_date, periods=len(use_data.values.flatten()), freq='15T')
In [204]:
time_index
Out[204]:
DatetimeIndex(['2004-01-01 00:00:00', '2004-01-01 00:15:00', '2004-01-01 00:30:00', '2004-01-01 00:45:00', '2004-01-01 01:00:00', '2004-01-01 01:15:00', '2004-01-01 01:30:00', '2004-01-01 01:45:00', '2004-01-01 02:00:00', '2004-01-01 02:15:00', ... '2005-02-16 09:30:00', '2005-02-16 09:45:00', '2005-02-16 10:00:00', '2005-02-16 10:15:00', '2005-02-16 10:30:00', '2005-02-16 10:45:00', '2005-02-16 11:00:00', '2005-02-16 11:15:00', '2005-02-16 11:30:00', '2005-02-16 11:45:00'], dtype='datetime64[ns]', length=39600, freq='15min')
In [205]:
use_data.values.flatten()
Out[205]:
array([16.853, 16.45 , 16.517, ..., nan, nan, nan])
In [206]:
# 展平数据并创建Series data_series = pd.Series(use_data.values.flatten(), index=time_index)
In [207]:
# 重采样为每小时,并对每小时的数据进行求和 hourly_data = data_series.resample('H').sum().to_frame().reset_index() # 打印结果 print(hourly_data)
index 0 0 2004-01-01 00:00:00 66.693 1 2004-01-01 01:00:00 72.720 2 2004-01-01 02:00:00 72.185 3 2004-01-01 03:00:00 56.217 4 2004-01-01 04:00:00 67.324 ... ... ... 9895 2005-02-16 07:00:00 0.000 9896 2005-02-16 08:00:00 0.000 9897 2005-02-16 09:00:00 0.000 9898 2005-02-16 10:00:00 0.000 9899 2005-02-16 11:00:00 0.000 [9900 rows x 2 columns]
C:\Users\liuhao\AppData\Local\Temp\ipykernel_18300\3846816627.py:2: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead. hourly_data = data_series.resample('H').sum().to_frame().reset_index()
In [208]:
hourly_data.columns = ['time', 'power']
In [209]:
import matplotlib.pyplot as plt
In [210]:
plt.plot(hourly_data.power[:500])
Out[210]:
[<matplotlib.lines.Line2D at 0x2d37c1168d0>]
In [211]:
hourly_data.to_csv('data/load_data_hourly.csv', index=False, encoding='utf-8-sig')
In [211]: