ICEEMDAN-Solar_power-forecast/数据预处理.ipynb

647 KiB
Raw Permalink Blame History

In [1]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from tensorflow.keras import Sequential

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
C:\Users\asus\AppData\Roaming\Python\Python39\site-packages\pandas\core\computation\expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.8.3' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
C:\Users\asus\AppData\Roaming\Python\Python39\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
In [2]:
# Load the raw site CSV into `data`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
data = pd.read_csv(
    r'D:\project\小论文1-基于ICEEMDAN分解的时序高维变化的短期光伏功率预测模型\CEEMAN-PosConv1dbiLSTM-LSTM\对比模型\58-Site_DKA-M17_C-Phase.csv'
)
In [3]:
# Show the raw frame exactly as loaded (pandas abbreviates the middle rows).
data
Out[3]:
timestamp Active_Energy_Delivered_Received Current_Phase_Average Active_Power Performance_Ratio Wind_Speed Weather_Temperature_Celsius Weather_Relative_Humidity Global_Horizontal_Radiation Diffuse_Horizontal_Radiation Wind_Direction Weather_Daily_Rainfall Radiation_Global_Tilted Radiation_Diffuse_Tilted
0 2010-02-18 14:35:00 0.0 0.000000 0.000000 NaN 6.793873 35.132046 13.933495 1000.515625 97.682610 126.266418 0.0 NaN NaN
1 2010-02-18 14:40:00 0.0 0.000000 0.000000 NaN 6.926013 34.586330 14.363612 989.110413 102.564949 116.272385 0.0 NaN NaN
2 2010-02-18 14:45:00 0.0 0.000000 0.000000 NaN 6.824874 34.628662 13.933328 977.882629 102.709160 141.693970 0.0 NaN NaN
3 2010-02-18 14:50:00 0.0 0.000000 0.000000 NaN 5.291194 35.258572 13.457552 963.508484 100.324097 130.381912 0.0 NaN NaN
4 2010-02-18 14:55:00 0.0 0.000000 0.000000 NaN 6.065388 35.220058 13.886837 939.744995 105.697617 126.441544 0.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1502295 2024-07-17 12:20:00 83264.0 17.695337 4.331866 87.961319 NaN NaN NaN NaN NaN NaN NaN 947.065369 144.291245
1502296 2024-07-17 12:25:00 83265.0 17.795330 4.350333 90.215775 NaN NaN NaN NaN NaN NaN NaN 927.335022 131.287155
1502297 2024-07-17 12:30:00 83265.0 17.962000 4.386533 94.664726 NaN 14.208377 33.600403 823.925476 83.903313 19.253492 0.0 891.106995 126.447548
1502298 2024-07-17 12:35:00 83265.0 17.877998 4.375267 93.486641 NaN 14.223358 33.683571 817.790710 76.371666 19.294001 0.0 900.018799 123.445114
1502299 2024-07-17 12:40:00 83266.0 17.829998 4.369600 90.526978 NaN 14.428312 32.949017 820.284790 74.797913 19.167789 0.0 928.239990 123.938103

1502300 rows × 14 columns

In [6]:
# Keep exactly one year of data: 2018-04-01 00:00 (inclusive) up to 2019-04-01 00:00 (exclusive).
# Rows are selected by timestamp instead of hard-coded row positions, so the window
# stays correct if rows are added to or removed from the source CSV. For the file used
# in the recorded run this is the same range as positions 853133..958252 (105120 rows).
_timestamps = pd.to_datetime(data['timestamp'])
_in_window = (_timestamps >= pd.Timestamp('2018-04-01')) & (_timestamps < pd.Timestamp('2019-04-01'))
# .copy() detaches the subset from `data`, so later edits to data2 never touch the raw frame.
data2 = data.loc[_in_window].copy()
In [7]:
# Inspect the one-year subset selected in the previous cell.
data2
Out[7]:
timestamp Active_Energy_Delivered_Received Current_Phase_Average Active_Power Performance_Ratio Wind_Speed Weather_Temperature_Celsius Weather_Relative_Humidity Global_Horizontal_Radiation Diffuse_Horizontal_Radiation Wind_Direction Weather_Daily_Rainfall Radiation_Global_Tilted Radiation_Diffuse_Tilted
853133 2018-04-01 00:00:00 18104.0 0.997333 0.0 0.0 NaN 19.779453 40.025826 3.232706 1.690531 64.372742 0.0 3.565593 0.742383
853134 2018-04-01 00:05:00 18104.0 0.997333 0.0 0.0 NaN 19.714937 39.605961 3.194991 1.576346 65.954178 0.0 3.469451 0.663080
853135 2018-04-01 00:10:00 18104.0 0.996000 0.0 0.0 NaN 19.549330 39.608631 3.070866 1.576157 65.347725 0.0 3.354114 0.540446
853136 2018-04-01 00:15:00 18104.0 1.000000 0.0 0.0 NaN 19.405870 39.680702 3.038623 1.482489 67.103271 0.0 3.365968 0.597973
853137 2018-04-01 00:20:00 18104.0 1.000000 0.0 0.0 NaN 19.387363 39.319881 2.656474 1.134153 66.430733 0.0 3.222809 0.530707
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
958248 2019-03-31 23:35:00 29021.0 0.991333 0.0 0.0 NaN 13.303740 34.212711 1.210789 0.787026 34.165325 0.0 3.271109 0.476681
958249 2019-03-31 23:40:00 29021.0 0.995333 0.0 0.0 NaN 13.120920 34.394939 2.142980 1.582670 34.202522 0.0 3.163039 0.444219
958250 2019-03-31 23:45:00 29021.0 0.995333 0.0 0.0 NaN 12.879215 35.167400 1.926214 1.545889 34.233902 0.0 3.197096 0.475794
958251 2019-03-31 23:50:00 29021.0 0.999333 0.0 0.0 NaN 12.915867 35.359989 1.317695 0.851529 34.308563 0.0 2.873335 0.320598
958252 2019-03-31 23:55:00 29021.0 1.000000 0.0 0.0 NaN 13.134816 34.500034 1.043269 0.597816 34.228458 0.0 2.947993 0.294085

105120 rows × 14 columns

In [8]:
# Quick visual check: one stacked subplot per plotted column of the one-year subset.
data2.plot(
    subplots=True,
    legend=True,
    figsize=(12, 8),
)
Out[8]:
array([<Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >,
       <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >,
       <Axes: >], dtype=object)
No description has been provided for this image
In [9]:
# Summarise dtypes and non-null counts per column to locate missing values.
data2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105120 entries, 853133 to 958252
Data columns (total 14 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   timestamp                         105120 non-null  object 
 1   Active_Energy_Delivered_Received  104221 non-null  float64
 2   Current_Phase_Average             104221 non-null  float64
 3   Active_Power                      104221 non-null  float64
 4   Performance_Ratio                 104221 non-null  float64
 5   Wind_Speed                        0 non-null       float64
 6   Weather_Temperature_Celsius       105120 non-null  float64
 7   Weather_Relative_Humidity         105120 non-null  float64
 8   Global_Horizontal_Radiation       105120 non-null  float64
 9   Diffuse_Horizontal_Radiation      105120 non-null  float64
 10  Wind_Direction                    105120 non-null  float64
 11  Weather_Daily_Rainfall            105120 non-null  float64
 12  Radiation_Global_Tilted           103998 non-null  float64
 13  Radiation_Diffuse_Tilted          103998 non-null  float64
dtypes: float64(13), object(1)
memory usage: 11.2+ MB
In [10]:
# Shorten the verbose source column names.
# Renaming by name (rather than assigning a positional list inside a one-element
# loop) keeps every short label attached to the intended column even if the CSV
# column order changes; names not present in the frame are simply left alone.
data2 = data2.rename(columns={
    'timestamp': 'time',
    'Active_Energy_Delivered_Received': 'AE_Power',
    'Current_Phase_Average': 'Current',
    'Active_Power': 'Power',
    'Performance_Ratio': 'PR',
    'Wind_Speed': 'Wind_speed',
    'Weather_Temperature_Celsius': 'Temp',
    'Weather_Relative_Humidity': 'Humidity',
    'Global_Horizontal_Radiation': 'GHI',
    'Diffuse_Horizontal_Radiation': 'DHI',
    'Wind_Direction': 'Wind_dir',
    'Weather_Daily_Rainfall': 'Rainfall',
    'Radiation_Global_Tilted': 'RGT',
    'Radiation_Diffuse_Tilted': 'RDT',
})
In [11]:
# Confirm the short column names are now in place.
data2
Out[11]:
time AE_Power Current Power PR Wind_speed Temp Humidity GHI DHI Wind_dir Rainfall RGT RDT
853133 2018-04-01 00:00:00 18104.0 0.997333 0.0 0.0 NaN 19.779453 40.025826 3.232706 1.690531 64.372742 0.0 3.565593 0.742383
853134 2018-04-01 00:05:00 18104.0 0.997333 0.0 0.0 NaN 19.714937 39.605961 3.194991 1.576346 65.954178 0.0 3.469451 0.663080
853135 2018-04-01 00:10:00 18104.0 0.996000 0.0 0.0 NaN 19.549330 39.608631 3.070866 1.576157 65.347725 0.0 3.354114 0.540446
853136 2018-04-01 00:15:00 18104.0 1.000000 0.0 0.0 NaN 19.405870 39.680702 3.038623 1.482489 67.103271 0.0 3.365968 0.597973
853137 2018-04-01 00:20:00 18104.0 1.000000 0.0 0.0 NaN 19.387363 39.319881 2.656474 1.134153 66.430733 0.0 3.222809 0.530707
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
958248 2019-03-31 23:35:00 29021.0 0.991333 0.0 0.0 NaN 13.303740 34.212711 1.210789 0.787026 34.165325 0.0 3.271109 0.476681
958249 2019-03-31 23:40:00 29021.0 0.995333 0.0 0.0 NaN 13.120920 34.394939 2.142980 1.582670 34.202522 0.0 3.163039 0.444219
958250 2019-03-31 23:45:00 29021.0 0.995333 0.0 0.0 NaN 12.879215 35.167400 1.926214 1.545889 34.233902 0.0 3.197096 0.475794
958251 2019-03-31 23:50:00 29021.0 0.999333 0.0 0.0 NaN 12.915867 35.359989 1.317695 0.851529 34.308563 0.0 2.873335 0.320598
958252 2019-03-31 23:55:00 29021.0 1.000000 0.0 0.0 NaN 13.134816 34.500034 1.043269 0.597816 34.228458 0.0 2.947993 0.294085

105120 rows × 14 columns

In [12]:
# Re-wrap the renamed subset as `df`.
df = pd.DataFrame(data2)

# Build the new column order: pull 'Power' out of its current slot and put it last.
columns = df.columns.tolist()
columns.append(columns.pop(columns.index('Power')))

# Re-index the frame with the reordered labels.
df = df[columns]

# Print the frame to confirm that 'Power' is now the final column.
print(df)
                       time  AE_Power   Current   PR  Wind_speed       Temp  \
853133  2018-04-01 00:00:00   18104.0  0.997333  0.0         NaN  19.779453   
853134  2018-04-01 00:05:00   18104.0  0.997333  0.0         NaN  19.714937   
853135  2018-04-01 00:10:00   18104.0  0.996000  0.0         NaN  19.549330   
853136  2018-04-01 00:15:00   18104.0  1.000000  0.0         NaN  19.405870   
853137  2018-04-01 00:20:00   18104.0  1.000000  0.0         NaN  19.387363   
...                     ...       ...       ...  ...         ...        ...   
958248  2019-03-31 23:35:00   29021.0  0.991333  0.0         NaN  13.303740   
958249  2019-03-31 23:40:00   29021.0  0.995333  0.0         NaN  13.120920   
958250  2019-03-31 23:45:00   29021.0  0.995333  0.0         NaN  12.879215   
958251  2019-03-31 23:50:00   29021.0  0.999333  0.0         NaN  12.915867   
958252  2019-03-31 23:55:00   29021.0  1.000000  0.0         NaN  13.134816   

         Humidity       GHI       DHI   Wind_dir  Rainfall       RGT  \
853133  40.025826  3.232706  1.690531  64.372742       0.0  3.565593   
853134  39.605961  3.194991  1.576346  65.954178       0.0  3.469451   
853135  39.608631  3.070866  1.576157  65.347725       0.0  3.354114   
853136  39.680702  3.038623  1.482489  67.103271       0.0  3.365968   
853137  39.319881  2.656474  1.134153  66.430733       0.0  3.222809   
...           ...       ...       ...        ...       ...       ...   
958248  34.212711  1.210789  0.787026  34.165325       0.0  3.271109   
958249  34.394939  2.142980  1.582670  34.202522       0.0  3.163039   
958250  35.167400  1.926214  1.545889  34.233902       0.0  3.197096   
958251  35.359989  1.317695  0.851529  34.308563       0.0  2.873335   
958252  34.500034  1.043269  0.597816  34.228458       0.0  2.947993   

             RDT  Power  
853133  0.742383    0.0  
853134  0.663080    0.0  
853135  0.540446    0.0  
853136  0.597973    0.0  
853137  0.530707    0.0  
...          ...    ...  
958248  0.476681    0.0  
958249  0.444219    0.0  
958250  0.475794    0.0  
958251  0.320598    0.0  
958252  0.294085    0.0  

[105120 rows x 14 columns]
In [13]:
# Rich display of the reordered frame ('Power' is the last column).
df
Out[13]:
time AE_Power Current PR Wind_speed Temp Humidity GHI DHI Wind_dir Rainfall RGT RDT Power
853133 2018-04-01 00:00:00 18104.0 0.997333 0.0 NaN 19.779453 40.025826 3.232706 1.690531 64.372742 0.0 3.565593 0.742383 0.0
853134 2018-04-01 00:05:00 18104.0 0.997333 0.0 NaN 19.714937 39.605961 3.194991 1.576346 65.954178 0.0 3.469451 0.663080 0.0
853135 2018-04-01 00:10:00 18104.0 0.996000 0.0 NaN 19.549330 39.608631 3.070866 1.576157 65.347725 0.0 3.354114 0.540446 0.0
853136 2018-04-01 00:15:00 18104.0 1.000000 0.0 NaN 19.405870 39.680702 3.038623 1.482489 67.103271 0.0 3.365968 0.597973 0.0
853137 2018-04-01 00:20:00 18104.0 1.000000 0.0 NaN 19.387363 39.319881 2.656474 1.134153 66.430733 0.0 3.222809 0.530707 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
958248 2019-03-31 23:35:00 29021.0 0.991333 0.0 NaN 13.303740 34.212711 1.210789 0.787026 34.165325 0.0 3.271109 0.476681 0.0
958249 2019-03-31 23:40:00 29021.0 0.995333 0.0 NaN 13.120920 34.394939 2.142980 1.582670 34.202522 0.0 3.163039 0.444219 0.0
958250 2019-03-31 23:45:00 29021.0 0.995333 0.0 NaN 12.879215 35.167400 1.926214 1.545889 34.233902 0.0 3.197096 0.475794 0.0
958251 2019-03-31 23:50:00 29021.0 0.999333 0.0 NaN 12.915867 35.359989 1.317695 0.851529 34.308563 0.0 2.873335 0.320598 0.0
958252 2019-03-31 23:55:00 29021.0 1.000000 0.0 NaN 13.134816 34.500034 1.043269 0.597816 34.228458 0.0 2.947993 0.294085 0.0

105120 rows × 14 columns

In [14]:
# Re-check dtypes and non-null counts after the column reorder.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105120 entries, 853133 to 958252
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   time        105120 non-null  object 
 1   AE_Power    104221 non-null  float64
 2   Current     104221 non-null  float64
 3   PR          104221 non-null  float64
 4   Wind_speed  0 non-null       float64
 5   Temp        105120 non-null  float64
 6   Humidity    105120 non-null  float64
 7   GHI         105120 non-null  float64
 8   DHI         105120 non-null  float64
 9   Wind_dir    105120 non-null  float64
 10  Rainfall    105120 non-null  float64
 11  RGT         103998 non-null  float64
 12  RDT         103998 non-null  float64
 13  Power       104221 non-null  float64
dtypes: float64(13), object(1)
memory usage: 11.2+ MB
In [15]:
# NOTE(review): this whole cell is disabled (commented-out) code, kept for reference only.
# # Print every position that contains NaN
# nan_positions = data2.isna()

# print("Positions of NaN values:")
# print(nan_positions)
# # Save the processed DataFrame as an Excel file
# excel_file_path = 'D:\project\小论文1-基于ICEEMDAN分解的时序高维变化的短期光伏功率预测模型\CEEMAN-PosConv1dbiLSTM-LSTM\对比模型\processed_data.xlsx'  # Excel output path and file name

# nan_positions.to_excel(excel_file_path, index=False)  # write the DataFrame to Excel without the index
In [16]:
# Remove the Wind_speed column (df.info() above reports 0 non-null values for it).
data3 = df.drop(columns=["Wind_speed"])
In [17]:
# Plot every remaining column in its own subplot after dropping Wind_speed.
# The former `import matplotlib as plt` was removed: it rebound `plt` (imported at the
# top of the notebook as matplotlib.pyplot) to the top-level matplotlib package, which
# would break any later plt.figure()/plt.show() call. DataFrame.plot needs no such import.
data3.plot(legend=True, subplots=True, figsize=(12, 8))
Out[17]:
array([<Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >,
       <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >],
      dtype=object)
No description has been provided for this image
In [18]:
# Inspect the frame that will be exported (13 columns, Wind_speed removed).
data3
Out[18]:
time AE_Power Current PR Temp Humidity GHI DHI Wind_dir Rainfall RGT RDT Power
853133 2018-04-01 00:00:00 18104.0 0.997333 0.0 19.779453 40.025826 3.232706 1.690531 64.372742 0.0 3.565593 0.742383 0.0
853134 2018-04-01 00:05:00 18104.0 0.997333 0.0 19.714937 39.605961 3.194991 1.576346 65.954178 0.0 3.469451 0.663080 0.0
853135 2018-04-01 00:10:00 18104.0 0.996000 0.0 19.549330 39.608631 3.070866 1.576157 65.347725 0.0 3.354114 0.540446 0.0
853136 2018-04-01 00:15:00 18104.0 1.000000 0.0 19.405870 39.680702 3.038623 1.482489 67.103271 0.0 3.365968 0.597973 0.0
853137 2018-04-01 00:20:00 18104.0 1.000000 0.0 19.387363 39.319881 2.656474 1.134153 66.430733 0.0 3.222809 0.530707 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
958248 2019-03-31 23:35:00 29021.0 0.991333 0.0 13.303740 34.212711 1.210789 0.787026 34.165325 0.0 3.271109 0.476681 0.0
958249 2019-03-31 23:40:00 29021.0 0.995333 0.0 13.120920 34.394939 2.142980 1.582670 34.202522 0.0 3.163039 0.444219 0.0
958250 2019-03-31 23:45:00 29021.0 0.995333 0.0 12.879215 35.167400 1.926214 1.545889 34.233902 0.0 3.197096 0.475794 0.0
958251 2019-03-31 23:50:00 29021.0 0.999333 0.0 12.915867 35.359989 1.317695 0.851529 34.308563 0.0 2.873335 0.320598 0.0
958252 2019-03-31 23:55:00 29021.0 1.000000 0.0 13.134816 34.500034 1.043269 0.597816 34.228458 0.0 2.947993 0.294085 0.0

105120 rows × 13 columns

In [21]:
data4 = pd.DataFrame(data3)

# Save the cleaned frame as a CSV file.
# The path is a raw string so the backslashes in the Windows path are never treated
# as escape sequences (the non-raw form relied on `\p`, `\C`, `\d` being invalid
# escapes, which newer Python versions warn about). The resulting value is unchanged.
# NOTE(review): absolute, machine-specific output location.
csv_file_path = r'D:\project\小论文1-基于ICEEMDAN分解的时序高维变化的短期光伏功率预测模型\CEEMAN-PosConv1dbiLSTM-LSTM\对比模型\data3.csv'  # CSV output path and file name

data4.to_csv(csv_file_path, index=False)  # write the CSV without the index column

print(f"DataFrame saved to {csv_file_path}")
DataFrame saved to D:\project\小论文1-基于ICEEMDAN分解的时序高维变化的短期光伏功率预测模型\CEEMAN-PosConv1dbiLSTM-LSTM\对比模型\data3.csv
In [21]:
# Re-draw the per-column subplots of the exported frame (same figure as the earlier plot cell).
# The former `import matplotlib as plt` was removed because it rebound `plt` from
# matplotlib.pyplot to the top-level matplotlib package; DataFrame.plot does not need it.
data3.plot(legend=True, subplots=True, figsize=(12, 8))
Out[21]:
array([<Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >,
       <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >, <Axes: >],
      dtype=object)
No description has been provided for this image
In [ ]: