22-T67/test.ipynb at master

1.0 MiB

Raw Permalink Blame History

In [30]:

import pandas as pd
import os
import datetime as dt
from prophet import Prophet

Importing plotly failed. Interactive plots will not work.

In [10]:

data_folder = [x for x in os.listdir('./data/') if x.startswith('城市_')]
data_folder.sort()

In [12]:

data_folder

Out[12]:

['城市_20150101-20141231',
 '城市_20160101-20161231',
 '城市_20170101-20171231',
 '城市_20180101-20181231',
 '城市_20190101-20191231',
 '城市_20200101-20201231',
 '城市_20210101-20211231']

In [18]:

# 一个读取数据并合成成一个大文件的函数
total_data = pd.DataFrame()
for folder in data__folder:
    for file in os.listdir(f"./data/{folder}"):
        if file.endswith('csv'):
            data = pd.read_csv(f'./data/{folder}/{file}')
            use_data = data[(data['type']=='PM2.5')|(data['type']=='O3')].copy()
            total_data = pd.concat([total_data, use_data])

In [19]:

total_data.shape

Out[19]:

(119419, 394)

In [20]:

total_data.head()

Out[20]:

	date	hour	type	北京	天津	石家庄	唐山	秦皇岛	邯郸	保定	...	果洛藏族自治州	玉树藏族自治州	海西蒙古族藏族自治州	博尔塔拉蒙古自治州	克孜勒苏柯尔克孜自治州	兰州新区	赣江新区	儋州	雄安新区	西咸新区
1	20150504	0	PM2.5	21.0	16.0	36.0	29.0	22.0	105.0	53.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
9	20150504	0	O3	70.0	57.0	17.0	68.0	52.0	47.0	61.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
16	20150504	1	PM2.5	15.0	16.0	44.0	27.0	16.0	74.0	31.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
24	20150504	1	O3	72.0	56.0	6.0	75.0	50.0	53.0	64.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
31	20150504	2	PM2.5	15.0	19.0	62.0	25.0	12.0	57.0	22.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 394 columns

In [36]:

def concat_date(x, y):
    time_str = f"{x} {y}:00:00"
    return dt.datetime.strptime(time_str, "%Y%m%d %H:%M:%S")

In [70]:

total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1)
total_data.ds = pd.to_datetime(total_data.ds)

In [71]:

total_data.reset_index(drop=True, inplace=True)

In [72]:

PM25_data = total_data[total_data["type"]=='PM2.5'].reset_index(drop=True)
O3_data = total_data[total_data["type"]=='O3'].reset_index(drop=True)

In [42]:

def build_model(city: str, data: pd.DataFrame, dtype:str, split_date="2021-01-01 00:00:00"):
    """_summary_

    Args:
        city (str): _description_
        data (pd.DataFrame): _description_
        dtype (str): _description_
        split_date (str, optional): _description_. Defaults to "2021-01-01 00:00:00".
    """
    use_data = data[(data['type']==dtype)][["ds", city]].copy()
    use_data.columns = ["ds", "y"]
    train_data = use_data[use_data.ds < split_date].copy()
    test_data = use_data[use_data.ds >= split_date].copy()
    model=Prophet(growth="linear",
              yearly_seasonality=True,
              weekly_seasonality=False,
              daily_seasonality=False,
              seasonality_mode="multiplicative",
              seasonality_prior_scale=12,
              )
    model.fit(train)
    forecast=model.predict(test)

In [73]:

train, test = build_model('北京', total_data, dtype='PM2.5')

In [67]:

import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = (24, 20)

In [81]:

import matplotlib.dates as mdates

In [93]:

test.ds.values

Out[93]:

array(['2021-12-31T00:00:00.000000000', '2021-12-31T01:00:00.000000000',
       '2021-12-31T02:00:00.000000000', ...,
       '2021-05-09T21:00:00.000000000', '2021-05-09T22:00:00.000000000',
       '2021-05-09T23:00:00.000000000'], dtype='datetime64[ns]')

In [2]:

from get_holiday_cn.client import getHoliday

In [20]:

client = getHoliday()
r = client.assemble_holiday_data(today='2016-01-01')

In [22]:

Out[22]:

{'code': 0,
 'type': {'type': 2, 'name': '周五', 'week': 5, 'status': 0},
 'holiday': {'holiday': True, 'name': '元旦', 'date': '2016-01-01'}}

In [15]:

import pandas as pd

In [23]:

def get_date_type(date, holiday_client):
    rst = holiday_client.assemble_holiday_data(today=date)
    if rst.get('code') == 0:
        if rst.get('holiday') is None:
            return 'oridinary'
        else:
            return rst.get('holiday').get('name')

In [27]:

def build_holiday(start_date:str="2015-01-01", end_date:str="2021-12-31"):
    """基于起止日期,将该时间段内的国内假期都找出来

    Args:
        start_date (str): 以"YYYY-MM-DD"形式的字符串， 默认2015-01-01
        end_date (_type_): 以"YYYY-MM-DD"形式的字符串，默认2021-12-31

    Returns:
        _type_: _description_
    """
    holiday_dict = {}
    ds_list = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='D'), columns=['date'])
    ds_list.date = ds_list.date.apply(lambda x: dt.datetime.strftime(x, format='%Y-%m-%d'))
    client = getHoliday()
    ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client))
    special_date = ds_list[ds_list.day_type  != 'simple'].copy()
    holiday_dict = special_date.groupby('day_type')['date'].apply(list).to_dict()
    return holiday_dict

In [34]:

ds_list = pd.DataFrame(pd.date_range(start='2018-01-01', end='2019-01-01', freq='D'), columns=['date'])

client = getHoliday()
ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client))
special_date = ds_list[ds_list.day_type!='oridinary'].copy()

In [37]:

special_date.groupby('day_type')['date'].apply(list).to_dict()

Out[37]:

{'中秋节': ['2018-09-24'],
 '元旦': ['2018-01-01', '2018-12-30', '2018-12-31', '2019-01-01'],
 '元旦调休': ['2018-12-29'],
 '劳动节': ['2018-04-29', '2018-04-30', '2018-05-01'],
 '劳动节调休': ['2018-04-28'],
 '国庆节': ['2018-10-01',
  '2018-10-02',
  '2018-10-03',
  '2018-10-04',
  '2018-10-05',
  '2018-10-06',
  '2018-10-07'],
 '国庆节调休': ['2018-09-29', '2018-09-30'],
 '春节': ['2018-02-15',
  '2018-02-16',
  '2018-02-17',
  '2018-02-18',
  '2018-02-19',
  '2018-02-20',
  '2018-02-21'],
 '春节调休': ['2018-02-11', '2018-02-24'],
 '清明节': ['2018-04-05', '2018-04-06', '2018-04-07'],
 '清明节调休': ['2018-04-08'],
 '端午节': ['2018-06-18']}

In [38]:

import holidays
import pandas as pd
import os
import numpy as np
from prophet import Prophet
import datetime as dt
from get_holiday_cn.client import getHoliday
from logzero import logger
import pickle


def concat_date(x:str, y:str):
    """_summary_

    Args:
        x (str): 年月日
        y (str): 小时

    Returns:
        _type_: 合成的时间
    """
    time_str = f"{x} {y}:00:00"
    return dt.datetime.strptime(time_str, "%Y%m%d %H:%M:%S")


def load_data():
    data_folder = [x for x in os.listdir('./data/') if x.startswith('城市_')]
    data_folder.sort()
    # 一个读取数据并合成成一个大文件的函数
    total_data = pd.DataFrame()
    for folder in data_folder:
        for file in os.listdir(f"./data/{folder}"):
            if file.endswith('csv'):
                data = pd.read_csv(f'./data/{folder}/{file}')
                use_data = data[(data['type']=='PM2.5')|(data['type']=='O3')].copy()
                total_data = pd.concat([total_data, use_data])
    total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1)
    total_data.ds = pd.to_datetime(total_data.ds)
    total_data.sort_values(by='ds', ascending=True)
    total_data.reset_index(drop=True, inplace=True)
    logger.info(f"总数据集大小：{total_data.shape}")
    return total_data


def build_model(city: str, data: pd.DataFrame, dtype:str, holiday_mode:dict, split_date="2021-01-01 00:00:00"):
    """_summary_

    Args:
        city (str): 城市
        data (pd.DataFrame): 数据
        dtype (str): O3还是PM2.5
        holiday_mode (dict): 假期字典
        split_date (str, optional): 划分训练测试的分割日期. Defaults to "2021-01-01".

    Returns:
        model: 模型
        forecast: 对该组数据的预测
    """
    logger.info(f"选择了 {city} 的 {dtype} 数据，")
    use_data = data[(data['type']==dtype)][["ds", city]].copy()
    use_data.columns = ["ds", "y"]
    train_data = use_data[use_data.ds < split_date].copy()
    test_data = use_data[use_data.ds >= split_date].copy()
    model=Prophet(growth="linear",
              yearly_seasonality=True,
              weekly_seasonality=True,
              daily_seasonality=True,
              seasonality_mode="multiplicative",
              seasonality_prior_scale=12,
              holidays=holiday_mode
              )
    model.fit(train_data)
    forecast=model.predict(test_data)
    return model, forecast


def get_date_type(date:str, holiday_client:getHoliday):
    """一个判断某个日期是哪种假期的类

    Args:
        date (str): "YYYY-MM-DD"
        holiday_client (getHoliday): object of getHoliday class

    Returns:
        str: oridinary for simple day and others for special day
    """
    rst = holiday_client.assemble_holiday_data(today=date)
    if rst.get('code') == 0:
        if rst.get('holiday') is None:
            return 'oridinary'
        else:
            return rst.get('holiday').get('name')


def build_holiday(start_date:str="2015-01-01", end_date:str="2021-12-31"):
    """基于起止日期,将该时间段内的国内假期都找出来，包括本应该放假但是最后调休上班的

    Args:
        start_date (str): 以"YYYY-MM-DD"形式的字符串， 默认2015-01-01
        end_date (_type_): 以"YYYY-MM-DD"形式的字符串，默认2021-12-31

    Returns:
        _type_: _description_
    """
    ds_list = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='D'), columns=['date'])
    ds_list.date = ds_list.date.apply(lambda x: dt.datetime.strftime(x, format='%Y-%m-%d'))
    client = getHoliday()
    ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client))
    special_date = ds_list[ds_list.day_type  != 'simple'].copy()
    special_date.columns = ['ds', 'holiday']
    return special_date

In [39]:

data_type = 'O3'
city_list = ['北京', '天津']
if os.path.exists('./data/total_data.csv'):
    data = pd.read_csv('./data/total_data.csv')
else:
    data = load_data()
    data.to_csv('./data/total_data.csv', encoding='utf-8', index=False)
city_list = ['北京', '天津']
model_dict = dict()
predict_dict = dict()
holiday_data = build_holiday(data.ds.min(), data.ds.max())

In [40]:

holiday_data

Out[40]:

	ds	holiday
0	2015-01-02	元旦
1	2015-01-03	元旦
2	2015-01-04	元旦调休
3	2015-01-05	oridinary
4	2015-01-06	oridinary
...	...	...
2551	2021-12-27	oridinary
2552	2021-12-28	oridinary
2553	2021-12-29	oridinary
2554	2021-12-30	oridinary
2555	2021-12-31	oridinary

2556 rows × 2 columns

In [ ]:

for city in city_list:
    use_data = data[city]
    model, pred = build_model(city, use_data, data_type, holiday_data, '2021-01-01')
    model_dict[city] = model
    predict_dict[city] = pred
    logger.info(f"{city} 模型构建完成")
if not os.path.exists('./result/'):
    os.mkdir('./result/')
if not os.path.exists(f'./result/{data_type}/'):
    os.mkdir(f'./result/{data_type}')
if not os.path.exists(f'./result/{data_type}/model/'):
    os.mkdir(f'./result/{data_type}/model')
if not os.path.exists(f'./result/{data_type}/data/'):
    os.mkdir(f'./result/{data_type}/data/')
for city in predict_dict:
    city_pred = predict_dict.get(city)
    city_pred.to_csv(f'./result/{data_type}/data/{city}.csv', encoding='utf-8', index=False)
    logger.info(f"{city} 预测数据保存完成")
for city in model_dict:
    city_model = model_dict.get(city)
    with open(f'./result/{data_type}/model/{city}.pkl', 'wb') as fwb:
        pickle.dump(city_model, fwb)
        logger.info(f"{city} 模型保存完成")

In [1]:

import pandas as pd

In [2]:

d1 = pd.read_csv('./data/城市_20150101-20141231/china_cities_20150102.csv')
d2 = pd.read_csv('./data/城市_20150101-20141231/china_cities_20150103.csv')

In [5]:

d1 = d1[(d1['type']=='O3')|(d1['type']=='PM2.5')].copy()
d2 = d2[(d2['type']=='O3')|(d2['type']=='PM2.5')].copy()

In [6]:

pd.concat([d1, d2])

Out[6]:

	date	hour	type	北京	天津	石家庄	唐山	秦皇岛	邯郸	保定	...	阿克苏地区	克州	喀什地区	和田地区	伊犁哈萨克州	塔城地区	阿勒泰地区	石河子	五家渠	三沙
1	20150102	1	PM2.5	90.0	103.0	153.0	107.0	77.0	118.0	214.0	...	32.0	108.0	160.0	146.0	146.0	40.0	23.0	97.0	19.0	NaN
9	20150102	1	O3	5.0	3.0	6.0	9.0	11.0	6.0	23.0	...	17.0	53.0	15.0	23.0	8.0	79.0	44.0	6.0	14.0	NaN
16	20150102	2	PM2.5	83.0	108.0	173.0	112.0	83.0	129.0	242.0	...	56.0	104.0	290.0	225.0	137.0	48.0	29.0	83.0	23.0	NaN
24	20150102	2	O3	9.0	3.0	5.0	11.0	14.0	7.0	24.0	...	15.0	56.0	17.0	21.0	10.0	70.0	51.0	6.0	12.0	NaN
31	20150102	3	PM2.5	74.0	127.0	187.0	108.0	74.0	142.0	274.0	...	41.0	109.0	221.0	176.0	129.0	36.0	15.0	98.0	29.0	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
324	20150103	21	O3	7.0	4.0	5.0	8.0	2.0	6.0	19.0	...	10.0	45.0	20.0	21.0	9.0	85.0	40.0	4.0	12.0	NaN
331	20150103	22	PM2.5	200.0	357.0	400.0	233.0	202.0	183.0	460.0	...	61.0	100.0	222.0	109.0	40.0	39.0	28.0	63.0	61.0	NaN
339	20150103	22	O3	7.0	4.0	5.0	9.0	4.0	7.0	20.0	...	16.0	35.0	20.0	22.0	8.0	98.0	31.0	5.0	16.0	NaN
346	20150103	23	PM2.5	208.0	350.0	415.0	234.0	187.0	180.0	509.0	...	46.0	83.0	327.0	92.0	34.0	31.0	49.0	48.0	81.0	NaN
354	20150103	23	O3	7.0	4.0	5.0	9.0	3.0	6.0	25.0	...	21.0	49.0	18.0	27.0	9.0	92.0	43.0	5.0	7.0	NaN

94 rows × 370 columns

In [7]:

data = pd.read_csv('./data/total_data.csv')

In [1]:

import pandas as pd

In [9]:

from statistics import mean
import matplotlib.pyplot as plt
from sklearn.metrics import explained_variance_score,r2_score, median_absolute_error, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy import stats
import numpy as np
from matplotlib import rcParams
config = {"font.size": 16,"mathtext.fontset":'stix'}
rcParams.update(config)

In [3]:

data = pd.read_csv('./data/eval_data.csv')

In [4]:

x = data.values[:, 0]
y = data.values[:, 1]

In [30]:

def scatter_out_1(x, y, label, name): ## x,y为两个需要做对比分析的两个量。
    # ==========计算评价指标==========
    BIAS = mean(x - y)
    MSE = mean_squared_error(x, y)
    RMSE = np.power(MSE, 0.5)
    R2 = r2_score(x, y)
    MAE = mean_absolute_error(x, y)
    mape = mean_absolute_percentage_error(x, y) * 100
    EV = explained_variance_score(x, y)
    print('==========算法评价指标==========')
    print('Explained Variance(EV):', '%.3f' % (EV))
    print('Mean Absolute Error(MAE):', '%.3f' % (MAE))
    print('Mean squared error(MSE):', '%.3f' % (MSE))
    print('Root Mean Squard Error(RMSE):', '%.3f' % (RMSE))
    print('mean_absolute_percentage_error(MAPE):', '%.3f' % (mape))
    print('R_squared:', '%.3f' % (R2))
    # ===========Calculate the point density==========
    xy = np.vstack([x, y])
    z = stats.gaussian_kde(xy)(xy)
    # ===========Sort the points by density, so that the densest points are plotted last===========
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
    def best_fit_slope_and_intercept(xs, ys):
        m = (((mean(xs) * mean(ys)) - mean(xs * ys)) / ((mean(xs) * mean(xs)) - mean(xs * xs)))
        b = mean(ys) - m * mean(xs)
        return m, b
    m, b = best_fit_slope_and_intercept(x, y)
    regression_line = []
    for a in x:
        regression_line.append((m * a) + b)
    fig,ax=plt.subplots(figsize=(12,9),dpi=400)
    scatter=ax.scatter(x,y,marker='o',c=z*100,s=15,label='LST',cmap='Spectral_r')
    cbar=plt.colorbar(scatter,shrink=1,orientation='vertical',extend='both',pad=0.015,aspect=30,label='Frequency')
    min_value = min(min(x), min(y))
    max_value = max(max(x), max(y))

    plt.plot([min_value-5,max_value+5],[min_value-5,max_value+5],'black',lw=1.5)  # 画的1:1线，线的颜色为black，线宽为0.8
    plt.plot(x,regression_line,'red',lw=1.5)      # 预测与实测数据之间的回归线
    plt.axis([min_value-5,max_value+5,min_value-5,max_value+5])  # 设置线的范围
    plt.xlabel('Measured %s' % label)
    plt.ylabel('Retrived %s' % label)
    # plt.xticks(fontproperties='Times New Roman')
    # plt.yticks(fontproperties='Times New Roman')


    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.95), '$N=%.f$' % len(y)) # text的位置需要根据x,y的大小范围进行调整。
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.90), '$R^2=%.2f$' % R2)
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.85), '$RMSE=%.2f$' % RMSE)
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.80), '$MAE=%.2f$' % MAE)
    plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.75), '$MAPE=%.2f$%%' % mape)
    plt.xlim(min_value-5,max_value+5)                                  # 设置x坐标轴的显示范围
    plt.ylim(min_value-5,max_value+5)                                  # 设置y坐标轴的显示范围
    # file_name = name.split('(')[0].strip()
    plt.savefig(f'./figure/{name}.png',dpi=800,bbox_inches='tight',pad_inches=0)
    plt.show()

In [31]:

scatter_out_1(x, y, label='coal cost(t)', name='coal')

==========算法评价指标==========
Explained Variance(EV): 0.940
Mean Absolute Error(MAE): 14.998
Mean squared error(MSE): 736.604
Root Mean Squard Error(RMSE): 27.140
mean_absolute_percentage_error(MAPE): 9.027
R_squared: 0.940

No description has been provided for this image

In [ ]:

1.0 MiB Raw Permalink Blame History Unescape Escape

1.0 MiB

Raw Permalink Blame History