wgz_forecast/wgz_forecast(lh)/load/matrix2series.ipynb

71 KiB
Raw Blame History

In [198]:
import pandas as pd
import numpy as np
In [199]:
import pandas as pd

# 读取原始 CSV 文件
data = pd.read_csv('./data/load_original.csv')

# 打印原始数据的前几行以检查格式
print("原始数据:")
print(data.head())

# 定义一个函数来清理数值列
def clean_numeric_column(column):
    # 将逗号替换为小数点,并转换为浮点数
    return column.apply(lambda x: float(str(x).replace(',', '.')) if isinstance(x, str) else x)

# 获取所有需要清理的数值列(假设从 'h1' 到 'h24'
numeric_columns = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'h9', 'h10',
                   'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18', 'h19',
                   'h20', 'h21', 'h22', 'h23', 'h24']

# 对每个数值列应用清理函数
for col in numeric_columns:
    data[col] = clean_numeric_column(data[col])

# 打印清理后的数据
print("\n清理后的数据:")
print(data.head())
原始数据:
   zone_id  year  month  day      h1      h2      h3      h4      h5      h6  \
0        1  2004      1    1  16,853  16,450  16,517  16,873  17,064  17,727   
1        1  2004      1    2  14,155  14,038  14,019  14,489  14,920  16,072   
2        1  2004      1    3  14,439  14,272  14,109  14,081  14,775  15,491   
3        1  2004      1    4  11,273  10,415   9,943   9,859   9,881  10,248   
4        1  2004      1    5  10,750  10,321  10,107  10,065  10,419  12,101   

   ...     h15     h16     h17     h18     h19     h20     h21     h22  \
0  ...  13,518  13,138  14,130  16,809  18,150  18,235  17,925  16,904   
1  ...  16,127  15,448  15,839  17,727  18,895  18,650  18,443  17,580   
2  ...  13,507  13,414  13,826  15,825  16,996  16,394  15,406  14,278   
3  ...  14,207  13,614  14,162  16,237  17,430  17,218  16,633  15,238   
4  ...  13,845  14,350  15,501  17,307  18,786  19,089  19,192  18,416   

      h23     h24  
0  16,162  14,750  
1  16,467  15,258  
2  13,315  12,424  
3  13,580  11,727  
4  17,006  16,018  

[5 rows x 28 columns]

清理后的数据:
   zone_id  year  month  day      h1      h2      h3      h4      h5      h6  \
0        1  2004      1    1  16.853  16.450  16.517  16.873  17.064  17.727   
1        1  2004      1    2  14.155  14.038  14.019  14.489  14.920  16.072   
2        1  2004      1    3  14.439  14.272  14.109  14.081  14.775  15.491   
3        1  2004      1    4  11.273  10.415   9.943   9.859   9.881  10.248   
4        1  2004      1    5  10.750  10.321  10.107  10.065  10.419  12.101   

   ...     h15     h16     h17     h18     h19     h20     h21     h22  \
0  ...  13.518  13.138  14.130  16.809  18.150  18.235  17.925  16.904   
1  ...  16.127  15.448  15.839  17.727  18.895  18.650  18.443  17.580   
2  ...  13.507  13.414  13.826  15.825  16.996  16.394  15.406  14.278   
3  ...  14.207  13.614  14.162  16.237  17.430  17.218  16.633  15.238   
4  ...  13.845  14.350  15.501  17.307  18.786  19.089  19.192  18.416   

      h23     h24  
0  16.162  14.750  
1  16.467  15.258  
2  13.315  12.424  
3  13.580  11.727  
4  17.006  16.018  

[5 rows x 28 columns]
In [200]:
use_data = data[data['zone_id']==1].drop(columns=data.columns[:4])
In [201]:
user_data_flatten=use_data.values.flatten()
In [202]:
user_data_flatten
Out[202]:
array([16.853, 16.45 , 16.517, ...,    nan,    nan,    nan])
In [203]:
import pandas as pd

# 定义起始日期和时间
start_date = '2004-01-01 00:00:00'
# 使用 pd.date_range 生成时间索引
time_index = pd.date_range(start=start_date, periods=len(use_data.values.flatten()), freq='15T')

# 打印生成的时间索引
print(len(time_index))
39600
C:\Users\liuhao\AppData\Local\Temp\ipykernel_18300\3270028511.py:6: FutureWarning: 'T' is deprecated and will be removed in a future version, please use 'min' instead.
  time_index = pd.date_range(start=start_date, periods=len(use_data.values.flatten()), freq='15T')
In [204]:
time_index
Out[204]:
DatetimeIndex(['2004-01-01 00:00:00', '2004-01-01 00:15:00',
               '2004-01-01 00:30:00', '2004-01-01 00:45:00',
               '2004-01-01 01:00:00', '2004-01-01 01:15:00',
               '2004-01-01 01:30:00', '2004-01-01 01:45:00',
               '2004-01-01 02:00:00', '2004-01-01 02:15:00',
               ...
               '2005-02-16 09:30:00', '2005-02-16 09:45:00',
               '2005-02-16 10:00:00', '2005-02-16 10:15:00',
               '2005-02-16 10:30:00', '2005-02-16 10:45:00',
               '2005-02-16 11:00:00', '2005-02-16 11:15:00',
               '2005-02-16 11:30:00', '2005-02-16 11:45:00'],
              dtype='datetime64[ns]', length=39600, freq='15min')
In [205]:
use_data.values.flatten()
Out[205]:
array([16.853, 16.45 , 16.517, ...,    nan,    nan,    nan])
In [206]:
# 展平数据并创建Series
data_series = pd.Series(use_data.values.flatten(), index=time_index)
In [207]:
# 重采样为每小时,并对每小时的数据进行求和
hourly_data = data_series.resample('H').sum().to_frame().reset_index()

# 打印结果
print(hourly_data)
                   index       0
0    2004-01-01 00:00:00  66.693
1    2004-01-01 01:00:00  72.720
2    2004-01-01 02:00:00  72.185
3    2004-01-01 03:00:00  56.217
4    2004-01-01 04:00:00  67.324
...                  ...     ...
9895 2005-02-16 07:00:00   0.000
9896 2005-02-16 08:00:00   0.000
9897 2005-02-16 09:00:00   0.000
9898 2005-02-16 10:00:00   0.000
9899 2005-02-16 11:00:00   0.000

[9900 rows x 2 columns]
C:\Users\liuhao\AppData\Local\Temp\ipykernel_18300\3846816627.py:2: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.
  hourly_data = data_series.resample('H').sum().to_frame().reset_index()
In [208]:
hourly_data.columns = ['time', 'power']
In [209]:
import matplotlib.pyplot as plt
In [210]:
plt.plot(hourly_data.power[:500])
Out[210]:
[<matplotlib.lines.Line2D at 0x2d37c1168d0>]
No description has been provided for this image
In [211]:
hourly_data.to_csv('data/load_data_hourly.csv', index=False, encoding='utf-8-sig')
In [211]: