1.0 MiB
1.0 MiB
In [30]:
import pandas as pd import os import datetime as dt from prophet import Prophet
Importing plotly failed. Interactive plots will not work.
In [10]:
data_folder = [x for x in os.listdir('./data/') if x.startswith('城市_')] data_folder.sort()
In [12]:
data_folder
Out[12]:
['城市_20150101-20141231', '城市_20160101-20161231', '城市_20170101-20171231', '城市_20180101-20181231', '城市_20190101-20191231', '城市_20200101-20201231', '城市_20210101-20211231']
In [18]:
# 一个读取数据并合成成一个大文件的函数 total_data = pd.DataFrame() for folder in data__folder: for file in os.listdir(f"./data/{folder}"): if file.endswith('csv'): data = pd.read_csv(f'./data/{folder}/{file}') use_data = data[(data['type']=='PM2.5')|(data['type']=='O3')].copy() total_data = pd.concat([total_data, use_data])
In [19]:
total_data.shape
Out[19]:
(119419, 394)
In [20]:
total_data.head()
Out[20]:
date | hour | type | 北京 | 天津 | 石家庄 | 唐山 | 秦皇岛 | 邯郸 | 保定 | ... | 果洛藏族自治州 | 玉树藏族自治州 | 海西蒙古族藏族自治州 | 博尔塔拉蒙古自治州 | 克孜勒苏柯尔克孜自治州 | 兰州新区 | 赣江新区 | 儋州 | 雄安新区 | 西咸新区 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 20150504 | 0 | PM2.5 | 21.0 | 16.0 | 36.0 | 29.0 | 22.0 | 105.0 | 53.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | 20150504 | 0 | O3 | 70.0 | 57.0 | 17.0 | 68.0 | 52.0 | 47.0 | 61.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
16 | 20150504 | 1 | PM2.5 | 15.0 | 16.0 | 44.0 | 27.0 | 16.0 | 74.0 | 31.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
24 | 20150504 | 1 | O3 | 72.0 | 56.0 | 6.0 | 75.0 | 50.0 | 53.0 | 64.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
31 | 20150504 | 2 | PM2.5 | 15.0 | 19.0 | 62.0 | 25.0 | 12.0 | 57.0 | 22.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 394 columns
In [36]:
def concat_date(x, y): time_str = f"{x} {y}:00:00" return dt.datetime.strptime(time_str, "%Y%m%d %H:%M:%S")
In [70]:
total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1) total_data.ds = pd.to_datetime(total_data.ds)
In [71]:
total_data.reset_index(drop=True, inplace=True)
In [72]:
PM25_data = total_data[total_data["type"]=='PM2.5'].reset_index(drop=True) O3_data = total_data[total_data["type"]=='O3'].reset_index(drop=True)
In [42]:
def build_model(city: str, data: pd.DataFrame, dtype:str, split_date="2021-01-01 00:00:00"): """_summary_ Args: city (str): _description_ data (pd.DataFrame): _description_ dtype (str): _description_ split_date (str, optional): _description_. Defaults to "2021-01-01 00:00:00". """ use_data = data[(data['type']==dtype)][["ds", city]].copy() use_data.columns = ["ds", "y"] train_data = use_data[use_data.ds < split_date].copy() test_data = use_data[use_data.ds >= split_date].copy() model=Prophet(growth="linear", yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False, seasonality_mode="multiplicative", seasonality_prior_scale=12, ) model.fit(train) forecast=model.predict(test)
In [73]:
train, test = build_model('北京', total_data, dtype='PM2.5')
In [67]:
import matplotlib.pyplot as plt from matplotlib import rcParams rcParams['figure.figsize'] = (24, 20)
In [81]:
import matplotlib.dates as mdates
In [93]:
test.ds.values
Out[93]:
array(['2021-12-31T00:00:00.000000000', '2021-12-31T01:00:00.000000000', '2021-12-31T02:00:00.000000000', ..., '2021-05-09T21:00:00.000000000', '2021-05-09T22:00:00.000000000', '2021-05-09T23:00:00.000000000'], dtype='datetime64[ns]')
In [2]:
from get_holiday_cn.client import getHoliday
In [20]:
client = getHoliday() r = client.assemble_holiday_data(today='2016-01-01')
In [22]:
r
Out[22]:
{'code': 0, 'type': {'type': 2, 'name': '周五', 'week': 5, 'status': 0}, 'holiday': {'holiday': True, 'name': '元旦', 'date': '2016-01-01'}}
In [15]:
import pandas as pd
In [23]:
def get_date_type(date, holiday_client): rst = holiday_client.assemble_holiday_data(today=date) if rst.get('code') == 0: if rst.get('holiday') is None: return 'oridinary' else: return rst.get('holiday').get('name')
In [27]:
def build_holiday(start_date:str="2015-01-01", end_date:str="2021-12-31"): """基于起止日期,将该时间段内的国内假期都找出来 Args: start_date (str): 以"YYYY-MM-DD"形式的字符串, 默认2015-01-01 end_date (_type_): 以"YYYY-MM-DD"形式的字符串,默认2021-12-31 Returns: _type_: _description_ """ holiday_dict = {} ds_list = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='D'), columns=['date']) ds_list.date = ds_list.date.apply(lambda x: dt.datetime.strftime(x, format='%Y-%m-%d')) client = getHoliday() ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client)) special_date = ds_list[ds_list.day_type != 'simple'].copy() holiday_dict = special_date.groupby('day_type')['date'].apply(list).to_dict() return holiday_dict
In [34]:
ds_list = pd.DataFrame(pd.date_range(start='2018-01-01', end='2019-01-01', freq='D'), columns=['date']) client = getHoliday() ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client)) special_date = ds_list[ds_list.day_type!='oridinary'].copy()
In [37]:
special_date.groupby('day_type')['date'].apply(list).to_dict()
Out[37]:
{'中秋节': ['2018-09-24'], '元旦': ['2018-01-01', '2018-12-30', '2018-12-31', '2019-01-01'], '元旦调休': ['2018-12-29'], '劳动节': ['2018-04-29', '2018-04-30', '2018-05-01'], '劳动节调休': ['2018-04-28'], '国庆节': ['2018-10-01', '2018-10-02', '2018-10-03', '2018-10-04', '2018-10-05', '2018-10-06', '2018-10-07'], '国庆节调休': ['2018-09-29', '2018-09-30'], '春节': ['2018-02-15', '2018-02-16', '2018-02-17', '2018-02-18', '2018-02-19', '2018-02-20', '2018-02-21'], '春节调休': ['2018-02-11', '2018-02-24'], '清明节': ['2018-04-05', '2018-04-06', '2018-04-07'], '清明节调休': ['2018-04-08'], '端午节': ['2018-06-18']}
In [38]:
import holidays import pandas as pd import os import numpy as np from prophet import Prophet import datetime as dt from get_holiday_cn.client import getHoliday from logzero import logger import pickle def concat_date(x:str, y:str): """_summary_ Args: x (str): 年月日 y (str): 小时 Returns: _type_: 合成的时间 """ time_str = f"{x} {y}:00:00" return dt.datetime.strptime(time_str, "%Y%m%d %H:%M:%S") def load_data(): data_folder = [x for x in os.listdir('./data/') if x.startswith('城市_')] data_folder.sort() # 一个读取数据并合成成一个大文件的函数 total_data = pd.DataFrame() for folder in data_folder: for file in os.listdir(f"./data/{folder}"): if file.endswith('csv'): data = pd.read_csv(f'./data/{folder}/{file}') use_data = data[(data['type']=='PM2.5')|(data['type']=='O3')].copy() total_data = pd.concat([total_data, use_data]) total_data['ds'] = total_data.apply(lambda x: concat_date(x.date, x.hour), axis=1) total_data.ds = pd.to_datetime(total_data.ds) total_data.sort_values(by='ds', ascending=True) total_data.reset_index(drop=True, inplace=True) logger.info(f"总数据集大小:{total_data.shape}") return total_data def build_model(city: str, data: pd.DataFrame, dtype:str, holiday_mode:dict, split_date="2021-01-01 00:00:00"): """_summary_ Args: city (str): 城市 data (pd.DataFrame): 数据 dtype (str): O3还是PM2.5 holiday_mode (dict): 假期字典 split_date (str, optional): 划分训练测试的分割日期. Defaults to "2021-01-01". Returns: model: 模型 forecast: 对该组数据的预测 """ logger.info(f"选择了 {city} 的 {dtype} 数据,") use_data = data[(data['type']==dtype)][["ds", city]].copy() use_data.columns = ["ds", "y"] train_data = use_data[use_data.ds < split_date].copy() test_data = use_data[use_data.ds >= split_date].copy() model=Prophet(growth="linear", yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True, seasonality_mode="multiplicative", seasonality_prior_scale=12, holidays=holiday_mode ) model.fit(train_data) forecast=model.predict(test_data) return model, forecast def get_date_type(date:str, holiday_client:getHoliday): """一个判断某个日期是哪种假期的类 Args: date (str): "YYYY-MM-DD" holiday_client (getHoliday): object of getHoliday class Returns: str: oridinary for simple day and others for special day """ rst = holiday_client.assemble_holiday_data(today=date) if rst.get('code') == 0: if rst.get('holiday') is None: return 'oridinary' else: return rst.get('holiday').get('name') def build_holiday(start_date:str="2015-01-01", end_date:str="2021-12-31"): """基于起止日期,将该时间段内的国内假期都找出来,包括本应该放假但是最后调休上班的 Args: start_date (str): 以"YYYY-MM-DD"形式的字符串, 默认2015-01-01 end_date (_type_): 以"YYYY-MM-DD"形式的字符串,默认2021-12-31 Returns: _type_: _description_ """ ds_list = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='D'), columns=['date']) ds_list.date = ds_list.date.apply(lambda x: dt.datetime.strftime(x, format='%Y-%m-%d')) client = getHoliday() ds_list['day_type'] = ds_list.date.apply(lambda x: get_date_type(x, client)) special_date = ds_list[ds_list.day_type != 'simple'].copy() special_date.columns = ['ds', 'holiday'] return special_date
In [39]:
data_type = 'O3' city_list = ['北京', '天津'] if os.path.exists('./data/total_data.csv'): data = pd.read_csv('./data/total_data.csv') else: data = load_data() data.to_csv('./data/total_data.csv', encoding='utf-8', index=False) city_list = ['北京', '天津'] model_dict = dict() predict_dict = dict() holiday_data = build_holiday(data.ds.min(), data.ds.max())
In [40]:
holiday_data
Out[40]:
ds | holiday | |
---|---|---|
0 | 2015-01-02 | 元旦 |
1 | 2015-01-03 | 元旦 |
2 | 2015-01-04 | 元旦调休 |
3 | 2015-01-05 | oridinary |
4 | 2015-01-06 | oridinary |
... | ... | ... |
2551 | 2021-12-27 | oridinary |
2552 | 2021-12-28 | oridinary |
2553 | 2021-12-29 | oridinary |
2554 | 2021-12-30 | oridinary |
2555 | 2021-12-31 | oridinary |
2556 rows × 2 columns
In [ ]:
for city in city_list: use_data = data[city] model, pred = build_model(city, use_data, data_type, holiday_data, '2021-01-01') model_dict[city] = model predict_dict[city] = pred logger.info(f"{city} 模型构建完成") if not os.path.exists('./result/'): os.mkdir('./result/') if not os.path.exists(f'./result/{data_type}/'): os.mkdir(f'./result/{data_type}') if not os.path.exists(f'./result/{data_type}/model/'): os.mkdir(f'./result/{data_type}/model') if not os.path.exists(f'./result/{data_type}/data/'): os.mkdir(f'./result/{data_type}/data/') for city in predict_dict: city_pred = predict_dict.get(city) city_pred.to_csv(f'./result/{data_type}/data/{city}.csv', encoding='utf-8', index=False) logger.info(f"{city} 预测数据保存完成") for city in model_dict: city_model = model_dict.get(city) with open(f'./result/{data_type}/model/{city}.pkl', 'wb') as fwb: pickle.dump(city_model, fwb) logger.info(f"{city} 模型保存完成")
In [1]:
import pandas as pd
In [2]:
d1 = pd.read_csv('./data/城市_20150101-20141231/china_cities_20150102.csv') d2 = pd.read_csv('./data/城市_20150101-20141231/china_cities_20150103.csv')
In [5]:
d1 = d1[(d1['type']=='O3')|(d1['type']=='PM2.5')].copy() d2 = d2[(d2['type']=='O3')|(d2['type']=='PM2.5')].copy()
In [6]:
pd.concat([d1, d2])
Out[6]:
date | hour | type | 北京 | 天津 | 石家庄 | 唐山 | 秦皇岛 | 邯郸 | 保定 | ... | 阿克苏地区 | 克州 | 喀什地区 | 和田地区 | 伊犁哈萨克州 | 塔城地区 | 阿勒泰地区 | 石河子 | 五家渠 | 三沙 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 20150102 | 1 | PM2.5 | 90.0 | 103.0 | 153.0 | 107.0 | 77.0 | 118.0 | 214.0 | ... | 32.0 | 108.0 | 160.0 | 146.0 | 146.0 | 40.0 | 23.0 | 97.0 | 19.0 | NaN |
9 | 20150102 | 1 | O3 | 5.0 | 3.0 | 6.0 | 9.0 | 11.0 | 6.0 | 23.0 | ... | 17.0 | 53.0 | 15.0 | 23.0 | 8.0 | 79.0 | 44.0 | 6.0 | 14.0 | NaN |
16 | 20150102 | 2 | PM2.5 | 83.0 | 108.0 | 173.0 | 112.0 | 83.0 | 129.0 | 242.0 | ... | 56.0 | 104.0 | 290.0 | 225.0 | 137.0 | 48.0 | 29.0 | 83.0 | 23.0 | NaN |
24 | 20150102 | 2 | O3 | 9.0 | 3.0 | 5.0 | 11.0 | 14.0 | 7.0 | 24.0 | ... | 15.0 | 56.0 | 17.0 | 21.0 | 10.0 | 70.0 | 51.0 | 6.0 | 12.0 | NaN |
31 | 20150102 | 3 | PM2.5 | 74.0 | 127.0 | 187.0 | 108.0 | 74.0 | 142.0 | 274.0 | ... | 41.0 | 109.0 | 221.0 | 176.0 | 129.0 | 36.0 | 15.0 | 98.0 | 29.0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
324 | 20150103 | 21 | O3 | 7.0 | 4.0 | 5.0 | 8.0 | 2.0 | 6.0 | 19.0 | ... | 10.0 | 45.0 | 20.0 | 21.0 | 9.0 | 85.0 | 40.0 | 4.0 | 12.0 | NaN |
331 | 20150103 | 22 | PM2.5 | 200.0 | 357.0 | 400.0 | 233.0 | 202.0 | 183.0 | 460.0 | ... | 61.0 | 100.0 | 222.0 | 109.0 | 40.0 | 39.0 | 28.0 | 63.0 | 61.0 | NaN |
339 | 20150103 | 22 | O3 | 7.0 | 4.0 | 5.0 | 9.0 | 4.0 | 7.0 | 20.0 | ... | 16.0 | 35.0 | 20.0 | 22.0 | 8.0 | 98.0 | 31.0 | 5.0 | 16.0 | NaN |
346 | 20150103 | 23 | PM2.5 | 208.0 | 350.0 | 415.0 | 234.0 | 187.0 | 180.0 | 509.0 | ... | 46.0 | 83.0 | 327.0 | 92.0 | 34.0 | 31.0 | 49.0 | 48.0 | 81.0 | NaN |
354 | 20150103 | 23 | O3 | 7.0 | 4.0 | 5.0 | 9.0 | 3.0 | 6.0 | 25.0 | ... | 21.0 | 49.0 | 18.0 | 27.0 | 9.0 | 92.0 | 43.0 | 5.0 | 7.0 | NaN |
94 rows × 370 columns
In [7]:
data = pd.read_csv('./data/total_data.csv')
In [1]:
import pandas as pd
In [9]:
from statistics import mean import matplotlib.pyplot as plt from sklearn.metrics import explained_variance_score,r2_score, median_absolute_error, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error from scipy import stats import numpy as np from matplotlib import rcParams config = {"font.size": 16,"mathtext.fontset":'stix'} rcParams.update(config)
In [3]:
data = pd.read_csv('./data/eval_data.csv')
In [4]:
x = data.values[:, 0] y = data.values[:, 1]
In [30]:
def scatter_out_1(x, y, label, name): ## x,y为两个需要做对比分析的两个量。 # ==========计算评价指标========== BIAS = mean(x - y) MSE = mean_squared_error(x, y) RMSE = np.power(MSE, 0.5) R2 = r2_score(x, y) MAE = mean_absolute_error(x, y) mape = mean_absolute_percentage_error(x, y) * 100 EV = explained_variance_score(x, y) print('==========算法评价指标==========') print('Explained Variance(EV):', '%.3f' % (EV)) print('Mean Absolute Error(MAE):', '%.3f' % (MAE)) print('Mean squared error(MSE):', '%.3f' % (MSE)) print('Root Mean Squard Error(RMSE):', '%.3f' % (RMSE)) print('mean_absolute_percentage_error(MAPE):', '%.3f' % (mape)) print('R_squared:', '%.3f' % (R2)) # ===========Calculate the point density========== xy = np.vstack([x, y]) z = stats.gaussian_kde(xy)(xy) # ===========Sort the points by density, so that the densest points are plotted last=========== idx = z.argsort() x, y, z = x[idx], y[idx], z[idx] def best_fit_slope_and_intercept(xs, ys): m = (((mean(xs) * mean(ys)) - mean(xs * ys)) / ((mean(xs) * mean(xs)) - mean(xs * xs))) b = mean(ys) - m * mean(xs) return m, b m, b = best_fit_slope_and_intercept(x, y) regression_line = [] for a in x: regression_line.append((m * a) + b) fig,ax=plt.subplots(figsize=(12,9),dpi=400) scatter=ax.scatter(x,y,marker='o',c=z*100,s=15,label='LST',cmap='Spectral_r') cbar=plt.colorbar(scatter,shrink=1,orientation='vertical',extend='both',pad=0.015,aspect=30,label='Frequency') min_value = min(min(x), min(y)) max_value = max(max(x), max(y)) plt.plot([min_value-5,max_value+5],[min_value-5,max_value+5],'black',lw=1.5) # 画的1:1线,线的颜色为black,线宽为0.8 plt.plot(x,regression_line,'red',lw=1.5) # 预测与实测数据之间的回归线 plt.axis([min_value-5,max_value+5,min_value-5,max_value+5]) # 设置线的范围 plt.xlabel('Measured %s' % label) plt.ylabel('Retrived %s' % label) # plt.xticks(fontproperties='Times New Roman') # plt.yticks(fontproperties='Times New Roman') plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.95), '$N=%.f$' % len(y)) # text的位置需要根据x,y的大小范围进行调整。 plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.90), '$R^2=%.2f$' % R2) plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.85), '$RMSE=%.2f$' % RMSE) plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.80), '$MAE=%.2f$' % MAE) plt.text(min_value-5 + (max_value-min_value) * 0.05, int(max_value * 0.75), '$MAPE=%.2f$%%' % mape) plt.xlim(min_value-5,max_value+5) # 设置x坐标轴的显示范围 plt.ylim(min_value-5,max_value+5) # 设置y坐标轴的显示范围 # file_name = name.split('(')[0].strip() plt.savefig(f'./figure/{name}.png',dpi=800,bbox_inches='tight',pad_inches=0) plt.show()
In [31]:
scatter_out_1(x, y, label='coal cost(t)', name='coal')
==========算法评价指标========== Explained Variance(EV): 0.940 Mean Absolute Error(MAE): 14.998 Mean squared error(MSE): 736.604 Root Mean Squard Error(RMSE): 27.140 mean_absolute_percentage_error(MAPE): 9.027 R_squared: 0.940
In [ ]: