22-T67/test.ipynb

1.0 MiB
Raw Permalink Blame History

In [30]:
import pandas as pd
import os
import datetime as dt
from prophet import Prophet
Importing plotly failed. Interactive plots will not work.
In [10]:
data_folder = [x for x in os.listdir('./data/') if x.startswith('城市_')]
data_folder.sort()
In [12]:
data_folder
Out[12]:
['城市_20150101-20141231',
 '城市_20160101-20161231',
 '城市_20170101-20171231',
 '城市_20180101-20181231',
 '城市_20190101-20191231',
 '城市_20200101-20201231',
 '城市_20210101-20211231']
In [18]:
# 一个读取数据并合成成一个大文件的函数
total_data = pd.DataFrame()
for folder in data__folder:
    for file in os.listdir(f"./data/{folder}"):
        if file.endswith('csv'):
            data = pd.read_csv(f'./data/{folder}/{file}')
            use_data = data[(data['type']=='PM2.5')|(data['type']=='O3')].copy()
            total_data = pd.concat([total_data, use_data])
In [19]:
total_data.shape
Out[19]:
(119419, 394)
In [20]:
total_data.head()
Out[20]:
date hour type 北京 天津 石家庄 唐山 秦皇岛 邯郸 保定 ... 果洛藏族自治州 玉树藏族自治州 海西蒙古族藏族自治州 博尔塔拉蒙古自治州 克孜勒苏柯尔克孜自治州 兰州新区 赣江新区 儋州 雄安新区 西咸新区
1 20150504 0 PM2.5 21.0 16.0 36.0 29.0 22.0 105.0 53.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 20150504 0 O3 70.0 57.0 17.0 68.0 52.0 47.0 61.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
16 20150504 1 PM2.5 15.0 16.0 44.0 27.0 16.0 74.0 31.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
24 20150504 1 O3 72.0 56.0 6.0 75.0 50.0 53.0 64.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
31 20150504 2 PM2.5 15.0 19.0 62.0 25.0 12.0 57.0 22.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 394 columns

In [36]:
def concat_date(x, y):
    time_str = f"{x} {y}:00:00"
    return dt.datetime.strptime(time_str, "%Y%m%d %H:%M:%S")
In [70]:
total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1)
total_data.ds = pd.to_datetime(total_data.ds)
In [71]:
total_data.reset_index(drop=True, inplace=True)
In [72]:
PM25_data = total_data[total_data["type"]=='PM2.5'].reset_index(drop=True)
O3_data = total_data[total_data["type"]=='O3'].reset_index(drop=True)
In [42]:
def build_model(city: str, data: pd.DataFrame, dtype:str, split_date="2021-01-01 00:00:00"):
    """_summary_

    Args:
        city (str): _description_
        data (pd.DataFrame): _description_
        dtype (str): _description_
        split_date (str, optional): _description_. Defaults to "2021-01-01 00:00:00".
    """
    use_data = data[(data['type']==dtype)][["ds", city]].copy()
    use_data.columns = ["ds", "y"]
    train_data = use_data[use_data.ds < split_date].copy()
    test_data = use_data[use_data.ds >= split_date].copy()
    model=Prophet(growth="linear",
              yearly_seasonality=True,
              weekly_seasonality=False,
              daily_seasonality=False,
              seasonality_mode="multiplicative",
              seasonality_prior_scale=12,
              )
    model.fit(train)
    forecast=model.predict(test)
In [73]:
train, test = build_model('北京', total_data, dtype='PM2.5')
In [67]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = (24, 20)
In [81]:
import matplotlib.dates as mdates
In [93]:
test.ds.values
Out[93]:
array(['2021-12-31T00:00:00.000000000', '2021-12-31T01:00:00.000000000',
       '2021-12-31T02:00:00.000000000', ...,
       '2021-05-09T21:00:00.000000000', '2021-05-09T22:00:00.000000000',
       '2021-05-09T23:00:00.000000000'], dtype='datetime64[ns]')
In [2]:
from get_holiday_cn.client import getHoliday
In [20]:
client = getHoliday()
r = client.assemble_holiday_data(today='2016-01-01')
In [22]:
r
Out[22]:
{'code': 0,
 'type': {'type': 2, 'name': '周五', 'week': 5, 'status': 0},
 'holiday': {'holiday': True, 'name': '元旦', 'date': '2016-01-01'}}
In [15]:
import pandas as pd
In [23]:
def get_date_type(date, holiday_client):
    rst = holiday_client.assemble_holiday_data(today=date)
    if rst.get('code') == 0:
        if rst.get('holiday') is None:
            return 'oridinary'
        else:
            return rst.get('holiday').get('name')
In [27]:
def build_holiday(start_date:str="2015-01-01", end_date:str="2021-12-31"):
    """基于起止日期,将该时间段内的国内假期都找出来

    Args:
        start_date (str): 以"YYYY-MM-DD"形式的字符串, 默认2015-01-01
        end_date (_type_): 以"YYYY-MM-DD"形式的字符串默认2021-12-31

    Returns:
        _type_: _description_
    """
    holiday_dict = {}
    ds_list = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='D'), columns=['date'])
    ds_list.date = ds_list.date.apply(lambda x: dt.datetime.strftime(x, format='%Y-%m-%d'))
    client = getHoliday()
    ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client))
    special_date = ds_list[ds_list.day_type  != 'simple'].copy()
    holiday_dict = special_date.groupby('day_type')['date'].apply(list).to_dict()
    return holiday_dict
In [34]:
ds_list = pd.DataFrame(pd.date_range(start='2018-01-01', end='2019-01-01', freq='D'), columns=['date'])

client = getHoliday()
ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client))
special_date = ds_list[ds_list.day_type!='oridinary'].copy()
In [37]:
special_date.groupby('day_type')['date'].apply(list).to_dict()
Out[37]:
{'中秋节': ['2018-09-24'],
 '元旦': ['2018-01-01', '2018-12-30', '2018-12-31', '2019-01-01'],
 '元旦调休': ['2018-12-29'],
 '劳动节': ['2018-04-29', '2018-04-30', '2018-05-01'],
 '劳动节调休': ['2018-04-28'],
 '国庆节': ['2018-10-01',
  '2018-10-02',
  '2018-10-03',
  '2018-10-04',
  '2018-10-05',
  '2018-10-06',
  '2018-10-07'],
 '国庆节调休': ['2018-09-29', '2018-09-30'],
 '春节': ['2018-02-15',
  '2018-02-16',
  '2018-02-17',
  '2018-02-18',
  '2018-02-19',
  '2018-02-20',
  '2018-02-21'],
 '春节调休': ['2018-02-11', '2018-02-24'],
 '清明节': ['2018-04-05', '2018-04-06', '2018-04-07'],
 '清明节调休': ['2018-04-08'],
 '端午节': ['2018-06-18']}
In [38]:
import holidays
import pandas as pd
import os
import numpy as np
from prophet import Prophet
import datetime as dt
from get_holiday_cn.client import getHoliday
from logzero import logger
import pickle


def concat_date(x:str, y:str):
    """_summary_

    Args:
        x (str): 年月日
        y (str): 小时

    Returns:
        _type_: 合成的时间
    """
    time_str = f"{x} {y}:00:00"
    return dt.datetime.strptime(time_str, "%Y%m%d %H:%M:%S")


def load_data():
    data_folder = [x for x in os.listdir('./data/') if x.startswith('城市_')]
    data_folder.sort()
    # 一个读取数据并合成成一个大文件的函数
    total_data = pd.DataFrame()
    for folder in data_folder:
        for file in os.listdir(f"./data/{folder}"):
            if file.endswith('csv'):
                data = pd.read_csv(f'./data/{folder}/{file}')
                use_data = data[(data['type']=='PM2.5')|(data['type']=='O3')].copy()
                total_data = pd.concat([total_data, use_data])
    total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1)
    total_data.ds = pd.to_datetime(total_data.ds)
    total_data.sort_values(by='ds', ascending=True)
    total_data.reset_index(drop=True, inplace=True)
    logger.info(f"总数据集大小:{total_data.shape}")
    return total_data


def build_model(city: str, data: pd.DataFrame, dtype:str, holiday_mode:dict, split_date="2021-01-01 00:00:00"):
    """_summary_

    Args:
        city (str): 城市
        data (pd.DataFrame): 数据
        dtype (str): O3还是PM2.5
        holiday_mode (dict): 假期字典
        split_date (str, optional): 划分训练测试的分割日期. Defaults to "2021-01-01".

    Returns:
        model: 模型
        forecast: 对该组数据的预测
    """
    logger.info(f"选择了 {city}{dtype} 数据,")
    use_data = data[(data['type']==dtype)][["ds", city]].copy()
    use_data.columns = ["ds", "y"]
    train_data = use_data[use_data.ds < split_date].copy()
    test_data = use_data[use_data.ds >= split_date].copy()
    model=Prophet(growth="linear",
              yearly_seasonality=True,
              weekly_seasonality=True,
              daily_seasonality=True,
              seasonality_mode="multiplicative",
              seasonality_prior_scale=12,
              holidays=holiday_mode
              )
    model.fit(train_data)
    forecast=model.predict(test_data)
    return model, forecast


def get_date_type(date:str, holiday_client:getHoliday):
    """一个判断某个日期是哪种假期的类

    Args:
        date (str): "YYYY-MM-DD"
        holiday_client (getHoliday): object of getHoliday class

    Returns:
        str: oridinary for simple day and others for special day
    """
    rst = holiday_client.assemble_holiday_data(today=date)
    if rst.get('code') == 0:
        if rst.get('holiday') is None:
            return 'oridinary'
        else:
            return rst.get('holiday').get('name')


def build_holiday(start_date:str="2015-01-01", end_date:str="2021-12-31"):
    """基于起止日期,将该时间段内的国内假期都找出来,包括本应该放假但是最后调休上班的

    Args:
        start_date (str): 以"YYYY-MM-DD"形式的字符串, 默认2015-01-01
        end_date (_type_): 以"YYYY-MM-DD"形式的字符串默认2021-12-31

    Returns:
        _type_: _description_
    """
    ds_list = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='D'), columns=['date'])
    ds_list.date = ds_list.date.apply(lambda x: dt.datetime.strftime(x, format='%Y-%m-%d'))
    client = getHoliday()
    ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client))
    special_date = ds_list[ds_list.day_type  != 'simple'].copy()
    special_date.columns = ['ds', 'holiday']
    return special_date
In [39]:
data_type = 'O3'
city_list = ['北京', '天津']
if os.path.exists('./data/total_data.csv'):
    data = pd.read_csv('./data/total_data.csv')
else:
    data = load_data()
    data.to_csv('./data/total_data.csv', encoding='utf-8', index=False)
city_list = ['北京', '天津']
model_dict = dict()
predict_dict = dict()
holiday_data = build_holiday(data.ds.min(), data.ds.max())
In [40]:
holiday_data
Out[40]:
ds holiday
0 2015-01-02 元旦
1 2015-01-03 元旦
2 2015-01-04 元旦调休
3 2015-01-05 oridinary
4 2015-01-06 oridinary
... ... ...
2551 2021-12-27 oridinary
2552 2021-12-28 oridinary
2553 2021-12-29 oridinary
2554 2021-12-30 oridinary
2555 2021-12-31 oridinary

2556 rows × 2 columns

In [ ]:
for city in city_list:
    use_data = data[city]
    model, pred = build_model(city, use_data, data_type, holiday_data, '2021-01-01')
    model_dict[city] = model
    predict_dict[city] = pred
    logger.info(f"{city} 模型构建完成")
if not os.path.exists('./result/'):
    os.mkdir('./result/')
if not os.path.exists(f'./result/{data_type}/'):
    os.mkdir(f'./result/{data_type}')
if not os.path.exists(f'./result/{data_type}/model/'):
    os.mkdir(f'./result/{data_type}/model')
if not os.path.exists(f'./result/{data_type}/data/'):
    os.mkdir(f'./result/{data_type}/data/')
for city in predict_dict:
    city_pred = predict_dict.get(city)
    city_pred.to_csv(f'./result/{data_type}/data/{city}.csv', encoding='utf-8', index=False)
    logger.info(f"{city} 预测数据保存完成")
for city in model_dict:
    city_model = model_dict.get(city)
    with open(f'./result/{data_type}/model/{city}.pkl', 'wb') as fwb:
        pickle.dump(city_model, fwb)
        logger.info(f"{city} 模型保存完成")
In [1]:
import pandas as pd
In [2]:
d1 = pd.read_csv('./data/城市_20150101-20141231/china_cities_20150102.csv')
d2 = pd.read_csv('./data/城市_20150101-20141231/china_cities_20150103.csv')
In [5]:
d1 = d1[(d1['type']=='O3')|(d1['type']=='PM2.5')].copy()
d2 = d2[(d2['type']=='O3')|(d2['type']=='PM2.5')].copy()
In [6]:
pd.concat([d1, d2])
Out[6]:
date hour type 北京 天津 石家庄 唐山 秦皇岛 邯郸 保定 ... 阿克苏地区 克州 喀什地区 和田地区 伊犁哈萨克州 塔城地区 阿勒泰地区 石河子 五家渠 三沙
1 20150102 1 PM2.5 90.0 103.0 153.0 107.0 77.0 118.0 214.0 ... 32.0 108.0 160.0 146.0 146.0 40.0 23.0 97.0 19.0 NaN
9 20150102 1 O3 5.0 3.0 6.0 9.0 11.0 6.0 23.0 ... 17.0 53.0 15.0 23.0 8.0 79.0 44.0 6.0 14.0 NaN
16 20150102 2 PM2.5 83.0 108.0 173.0 112.0 83.0 129.0 242.0 ... 56.0 104.0 290.0 225.0 137.0 48.0 29.0 83.0 23.0 NaN
24 20150102 2 O3 9.0 3.0 5.0 11.0 14.0 7.0 24.0 ... 15.0 56.0 17.0 21.0 10.0 70.0 51.0 6.0 12.0 NaN
31 20150102 3 PM2.5 74.0 127.0 187.0 108.0 74.0 142.0 274.0 ... 41.0 109.0 221.0 176.0 129.0 36.0 15.0 98.0 29.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
324 20150103 21 O3 7.0 4.0 5.0 8.0 2.0 6.0 19.0 ... 10.0 45.0 20.0 21.0 9.0 85.0 40.0 4.0 12.0 NaN
331 20150103 22 PM2.5 200.0 357.0 400.0 233.0 202.0 183.0 460.0 ... 61.0 100.0 222.0 109.0 40.0 39.0 28.0 63.0 61.0 NaN
339 20150103 22 O3 7.0 4.0 5.0 9.0 4.0 7.0 20.0 ... 16.0 35.0 20.0 22.0 8.0 98.0 31.0 5.0 16.0 NaN
346 20150103 23 PM2.5 208.0 350.0 415.0 234.0 187.0 180.0 509.0 ... 46.0 83.0 327.0 92.0 34.0 31.0 49.0 48.0 81.0 NaN
354 20150103 23 O3 7.0 4.0 5.0 9.0 3.0 6.0 25.0 ... 21.0 49.0 18.0 27.0 9.0 92.0 43.0 5.0 7.0 NaN

94 rows × 370 columns

In [7]:
data = pd.read_csv('./data/total_data.csv')
In [1]:
import pandas as pd
In [9]:
from statistics import mean
import matplotlib.pyplot as plt
from sklearn.metrics import explained_variance_score,r2_score, median_absolute_error, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy import stats
import numpy as np
from matplotlib import rcParams
config = {"font.size": 16,"mathtext.fontset":'stix'}
rcParams.update(config)
In [3]:
data = pd.read_csv('./data/eval_data.csv')
In [4]:
x = data.values[:, 0]
y = data.values[:, 1]
In [30]:
def scatter_out_1(x, y, label, name): ## x,y为两个需要做对比分析的两个量。
    # ==========计算评价指标==========
    BIAS = mean(x - y)
    MSE = mean_squared_error(x, y)
    RMSE = np.power(MSE, 0.5)
    R2 = r2_score(x, y)
    MAE = mean_absolute_error(x, y)
    mape = mean_absolute_percentage_error(x, y) * 100
    EV = explained_variance_score(x, y)
    print('==========算法评价指标==========')
    print('Explained Variance(EV):', '%.3f' % (EV))
    print('Mean Absolute Error(MAE):', '%.3f' % (MAE))
    print('Mean squared error(MSE):', '%.3f' % (MSE))
    print('Root Mean Squard Error(RMSE):', '%.3f' % (RMSE))
    print('mean_absolute_percentage_error(MAPE):', '%.3f' % (mape))
    print('R_squared:', '%.3f' % (R2))
    # ===========Calculate the point density==========
    xy = np.vstack([x, y])
    z = stats.gaussian_kde(xy)(xy)
    # ===========Sort the points by density, so that the densest points are plotted last===========
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
    def best_fit_slope_and_intercept(xs, ys):
        m = (((mean(xs) * mean(ys)) - mean(xs * ys)) / ((mean(xs) * mean(xs)) - mean(xs * xs)))
        b = mean(ys) - m * mean(xs)
        return m, b
    m, b = best_fit_slope_and_intercept(x, y)
    regression_line = []
    for a in x:
        regression_line.append((m * a) + b)
    fig,ax=plt.subplots(figsize=(12,9),dpi=400)
    scatter=ax.scatter(x,y,marker='o',c=z*100,s=15,label='LST',cmap='Spectral_r')
    cbar=plt.colorbar(scatter,shrink=1,orientation='vertical',extend='both',pad=0.015,aspect=30,label='Frequency')
    min_value = min(min(x), min(y))
    max_value = max(max(x), max(y))

    plt.plot([min_value-5,max_value+5],[min_value-5,max_value+5],'black',lw=1.5)  # 画的1:1线线的颜色为black线宽为0.8
    plt.plot(x,regression_line,'red',lw=1.5)      # 预测与实测数据之间的回归线
    plt.axis([min_value-5,max_value+5,min_value-5,max_value+5])  # 设置线的范围
    plt.xlabel('Measured %s' % label)
    plt.ylabel('Retrived %s' % label)
    # plt.xticks(fontproperties='Times New Roman')
    # plt.yticks(fontproperties='Times New Roman')


    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.95), '$N=%.f$' % len(y)) # text的位置需要根据x,y的大小范围进行调整。
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.90), '$R^2=%.2f$' % R2)
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.85), '$RMSE=%.2f$' % RMSE)
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.80), '$MAE=%.2f$' % MAE)
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.75), '$MAPE=%.2f$%%' % mape)
    plt.xlim(min_value-5,max_value+5)                                  # 设置x坐标轴的显示范围
    plt.ylim(min_value-5,max_value+5)                                  # 设置y坐标轴的显示范围
    # file_name = name.split('(')[0].strip()
    plt.savefig(f'./figure/{name}.png',dpi=800,bbox_inches='tight',pad_inches=0)
    plt.show()
In [31]:
scatter_out_1(x, y, label='coal cost(t)', name='coal')
==========算法评价指标==========
Explained Variance(EV): 0.940
Mean Absolute Error(MAE): 14.998
Mean squared error(MSE): 736.604
Root Mean Squard Error(RMSE): 27.140
mean_absolute_percentage_error(MAPE): 9.027
R_squared: 0.940
No description has been provided for this image
In [ ]: