{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import xgboost as xgb\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": " 所处地区 机组类型 参数分类 冷凝器型式 铭牌容量 (MW) longitude latitude altitude \\\n0 上海市 供热式 亚临界 水冷 5.707110 4.807875 3.467769 1.386294 \n1 上海市 凝气式 亚临界 水冷 5.707110 4.807875 3.467769 1.386294 \n2 上海市 凝气式 亚临界 水冷 5.771441 4.808939 3.476886 1.098612 \n3 上海市 凝气式 超超临界 水冷 6.908755 4.807356 3.458373 1.609438 \n4 上海市 纯凝式 亚临界 水冷 5.860786 4.807839 3.478627 2.833213 \n\n power_co2_factor heat_co2_factor \n0 0.574332 0.072680 \n1 0.582164 0.072391 \n2 0.569281 0.071041 \n3 0.506250 0.070460 \n4 0.565226 0.073717 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
所处地区机组类型参数分类冷凝器型式铭牌容量 (MW)longitudelatitudealtitudepower_co2_factorheat_co2_factor
0上海市供热式亚临界水冷5.7071104.8078753.4677691.3862940.5743320.072680
1上海市凝气式亚临界水冷5.7071104.8078753.4677691.3862940.5821640.072391
2上海市凝气式亚临界水冷5.7714414.8089393.4768861.0986120.5692810.071041
3上海市凝气式超超临界水冷6.9087554.8073563.4583731.6094380.5062500.070460
4上海市纯凝式亚临界水冷5.8607864.8078393.4786272.8332130.5652260.073717
\n
" }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('./results/去煤种化数据.csv')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": "(['所处地区', '机组类型', '参数分类', '冷凝器型式'],\n Index(['铭牌容量 (MW)', 'longitude', 'latitude', 'altitude'], dtype='object'))" }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "object_cols = data.columns[:4].tolist()\n", "num_cols = data.columns[4:8]\n", "object_cols, num_cols" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "test_data = pd.read_excel('./data/煤电机组情况(含企业名称).xlsx')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "test_geo_info = pd.read_excel('./data/电厂地理信息.xlsx')\n", "test_geo_info.rename(columns={'name':'企业名称'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "test_data = test_data.merge(test_geo_info, how='left', on='企业名称').drop(columns='address')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "test_data_cp = test_data.copy()\n", "test_data = test_data[['地区', '汽轮机类型', '压力参数', '冷却方式', '单机容量(MW)', 'lat', 'lng', 'altitude']].copy()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "test_data.columns = data.columns[:8].tolist()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "test_data['na_cols'] = test_data.isna().sum(axis=1).values" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "test_data = test_data[test_data['铭牌容量 (MW)']>=30].copy()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": "0.965160147200342" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data[test_data.na_cols <= 1]['铭牌容量 (MW)'].sum() /10 / 112228" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "new_test_data = test_data[test_data.na_cols <= 1].drop(columns='na_cols').reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": "水冷 413\n空冷 110\n其他 1\nName: 冷凝器型式, dtype: int64" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['冷凝器型式'].value_counts()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": "水冷-闭式循环 1442\n水冷-开式循环 737\n空冷-直接空冷 497\n其他 255\n空冷-间接空冷 221\n水冷 52\n空冷 14\n间接空冷 4\n直接空冷 2\nName: 冷凝器型式, dtype: int64" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_test_data['冷凝器型式'].value_counts()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "def change_type(x:str):\n", " if '水冷' in x:\n", " return '水冷'\n", " elif '空冷' in x:\n", " return \"空冷\"\n", " else:\n", " return '其他'" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "new_test_data.fillna('其他', inplace=True)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "new_test_data['冷凝器型式'] = new_test_data['冷凝器型式'].apply(change_type)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": "亚临界 265\n超临界 156\n超超临界 69\n超高压 32\n高压 2\nName: 参数分类, dtype: int64" }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['参数分类'].value_counts()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": "亚临界 1072\n高压 726\n超临界 608\n超高压 403\n超超临界 358\n中压 57\nName: 参数分类, dtype: int64" }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_test_data['参数分类'].value_counts()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "new_test_data['机组类型'] = new_test_data['机组类型'].apply(lambda x: x if x.endswith('式') else x + '式')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "for col in num_cols:\n", " new_test_data[col] = new_test_data[col].apply(lambda x: 0 if x<0 else x)\n", " new_test_data[col] = np.log1p(new_test_data[col])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": " 所处地区 机组类型 参数分类 冷凝器型式 铭牌容量 (MW) longitude latitude altitude\n0 安徽省 凝气式 亚临界 水冷 5.771441 3.451583 4.772094 2.397895\n1 安徽省 凝气式 亚临界 水冷 5.771441 3.451583 4.772094 2.397895\n2 安徽省 凝气式 超超临界 水冷 6.908755 3.451583 4.772094 2.397895\n3 安徽省 凝气式 超超临界 水冷 6.908755 3.451583 4.772094 2.397895\n4 安徽省 抽凝式 高压 水冷 3.713572 3.451583 4.772094 2.397895\n... ... ... ... ... ... ... ... ...\n3219 重庆市 抽背式 高压 其他 3.931826 3.427489 4.682353 5.645447\n3220 重庆市 抽背式 高压 其他 3.931826 3.427489 4.682353 5.645447\n3221 重庆市 抽凝式 高压 水冷 3.912023 3.427489 4.682353 5.645447\n3222 重庆市 背压式 高压 其他 3.433987 3.428715 4.682208 5.690359\n3223 重庆市 抽凝式 高压 水冷 4.836282 3.428715 4.682208 5.690359\n\n[3224 rows x 8 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
所处地区机组类型参数分类冷凝器型式铭牌容量 (MW)longitudelatitudealtitude
0安徽省凝气式亚临界水冷5.7714413.4515834.7720942.397895
1安徽省凝气式亚临界水冷5.7714413.4515834.7720942.397895
2安徽省凝气式超超临界水冷6.9087553.4515834.7720942.397895
3安徽省凝气式超超临界水冷6.9087553.4515834.7720942.397895
4安徽省抽凝式高压水冷3.7135723.4515834.7720942.397895
...........................
3219重庆市抽背式高压其他3.9318263.4274894.6823535.645447
3220重庆市抽背式高压其他3.9318263.4274894.6823535.645447
3221重庆市抽凝式高压水冷3.9120233.4274894.6823535.645447
3222重庆市背压式高压其他3.4339873.4287154.6822085.690359
3223重庆市抽凝式高压水冷4.8362823.4287154.6822085.690359
\n

3224 rows × 8 columns

\n
" }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_test_data" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": " 所处地区 机组类型 参数分类 冷凝器型式 铭牌容量 (MW) longitude latitude altitude \\\n0 上海市 供热式 亚临界 水冷 5.707110 4.807875 3.467769 1.386294 \n1 上海市 凝气式 亚临界 水冷 5.707110 4.807875 3.467769 1.386294 \n2 上海市 凝气式 亚临界 水冷 5.771441 4.808939 3.476886 1.098612 \n3 上海市 凝气式 超超临界 水冷 6.908755 4.807356 3.458373 1.609438 \n4 上海市 纯凝式 亚临界 水冷 5.860786 4.807839 3.478627 2.833213 \n... ... ... ... ... ... ... ... ... \n3219 重庆市 抽背式 高压 其他 3.931826 3.427489 4.682353 5.645447 \n3220 重庆市 抽背式 高压 其他 3.931826 3.427489 4.682353 5.645447 \n3221 重庆市 抽凝式 高压 水冷 3.912023 3.427489 4.682353 5.645447 \n3222 重庆市 背压式 高压 其他 3.433987 3.428715 4.682208 5.690359 \n3223 重庆市 抽凝式 高压 水冷 4.836282 3.428715 4.682208 5.690359 \n\n power_co2_factor heat_co2_factor \n0 0.574332 0.072680 \n1 0.582164 0.072391 \n2 0.569281 0.071041 \n3 0.506250 0.070460 \n4 0.565226 0.073717 \n... ... ... \n3219 NaN NaN \n3220 NaN NaN \n3221 NaN NaN \n3222 NaN NaN \n3223 NaN NaN \n\n[3748 rows x 10 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
所处地区机组类型参数分类冷凝器型式铭牌容量 (MW)longitudelatitudealtitudepower_co2_factorheat_co2_factor
0上海市供热式亚临界水冷5.7071104.8078753.4677691.3862940.5743320.072680
1上海市凝气式亚临界水冷5.7071104.8078753.4677691.3862940.5821640.072391
2上海市凝气式亚临界水冷5.7714414.8089393.4768861.0986120.5692810.071041
3上海市凝气式超超临界水冷6.9087554.8073563.4583731.6094380.5062500.070460
4上海市纯凝式亚临界水冷5.8607864.8078393.4786272.8332130.5652260.073717
.................................
3219重庆市抽背式高压其他3.9318263.4274894.6823535.645447NaNNaN
3220重庆市抽背式高压其他3.9318263.4274894.6823535.645447NaNNaN
3221重庆市抽凝式高压水冷3.9120233.4274894.6823535.645447NaNNaN
3222重庆市背压式高压其他3.4339873.4287154.6822085.690359NaNNaN
3223重庆市抽凝式高压水冷4.8362823.4287154.6822085.690359NaNNaN
\n

3748 rows × 10 columns

\n
" }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merge_data = pd.concat([data, new_test_data], axis=0)\n", "merge_data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": " 铭牌容量 (MW) longitude latitude altitude power_co2_factor \\\n0 5.707110 4.807875 3.467769 1.386294 0.574332 \n1 5.707110 4.807875 3.467769 1.386294 0.582164 \n2 5.771441 4.808939 3.476886 1.098612 0.569281 \n3 6.908755 4.807356 3.458373 1.609438 0.506250 \n4 5.860786 4.807839 3.478627 2.833213 0.565226 \n... ... ... ... ... ... \n3219 3.931826 3.427489 4.682353 5.645447 NaN \n3220 3.931826 3.427489 4.682353 5.645447 NaN \n3221 3.912023 3.427489 4.682353 5.645447 NaN \n3222 3.433987 3.428715 4.682208 5.690359 NaN \n3223 4.836282 3.428715 4.682208 5.690359 NaN \n\n heat_co2_factor 所处地区_上海市 所处地区_云南省 所处地区_内蒙古 所处地区_内蒙古自治区 ... \\\n0 0.072680 1 0 0 0 ... \n1 0.072391 1 0 0 0 ... \n2 0.071041 1 0 0 0 ... \n3 0.070460 1 0 0 0 ... \n4 0.073717 1 0 0 0 ... \n... ... ... ... ... ... ... \n3219 NaN 0 0 0 0 ... \n3220 NaN 0 0 0 0 ... \n3221 NaN 0 0 0 0 ... \n3222 NaN 0 0 0 0 ... \n3223 NaN 0 0 0 0 ... \n\n 机组类型_背压式 参数分类_中压 参数分类_亚临界 参数分类_超临界 参数分类_超超临界 参数分类_超高压 参数分类_高压 \\\n0 0 0 1 0 0 0 0 \n1 0 0 1 0 0 0 0 \n2 0 0 1 0 0 0 0 \n3 0 0 0 0 1 0 0 \n4 0 0 1 0 0 0 0 \n... ... ... ... ... ... ... ... \n3219 0 0 0 0 0 0 1 \n3220 0 0 0 0 0 0 1 \n3221 0 0 0 0 0 0 1 \n3222 1 0 0 0 0 0 1 \n3223 0 0 0 0 0 0 1 \n\n 冷凝器型式_其他 冷凝器型式_水冷 冷凝器型式_空冷 \n0 0 1 0 \n1 0 1 0 \n2 0 1 0 \n3 0 1 0 \n4 0 1 0 \n... ... ... ... \n3219 1 0 0 \n3220 1 0 0 \n3221 0 1 0 \n3222 1 0 0 \n3223 0 1 0 \n\n[3748 rows x 63 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
铭牌容量 (MW)longitudelatitudealtitudepower_co2_factorheat_co2_factor所处地区_上海市所处地区_云南省所处地区_内蒙古所处地区_内蒙古自治区...机组类型_背压式参数分类_中压参数分类_亚临界参数分类_超临界参数分类_超超临界参数分类_超高压参数分类_高压冷凝器型式_其他冷凝器型式_水冷冷凝器型式_空冷
05.7071104.8078753.4677691.3862940.5743320.0726801000...0010000010
15.7071104.8078753.4677691.3862940.5821640.0723911000...0010000010
25.7714414.8089393.4768861.0986120.5692810.0710411000...0010000010
36.9087554.8073563.4583731.6094380.5062500.0704601000...0000100010
45.8607864.8078393.4786272.8332130.5652260.0737171000...0010000010
..................................................................
32193.9318263.4274894.6823535.645447NaNNaN0000...0000001100
32203.9318263.4274894.6823535.645447NaNNaN0000...0000001100
32213.9120233.4274894.6823535.645447NaNNaN0000...0000001010
32223.4339873.4287154.6822085.690359NaNNaN0000...1000001100
32234.8362823.4287154.6822085.690359NaNNaN0000...0000001010
\n

3748 rows × 63 columns

\n
" }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "use_data = pd.get_dummies(merge_data, columns=object_cols)\n", "use_data" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "use_data.to_csv('./去煤种化后的训练数据.csv', encoding='utf-8-sig', index=False)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "train_set = use_data[~use_data.power_co2_factor.isna()].copy()\n", "test_set = use_data[use_data.power_co2_factor.isna()].copy()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "feature_cols = [x for x in train_set.columns if 'factor' not in x]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "train_data = train_set.copy()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "train, valid = train_test_split(train_data.dropna(), test_size=0.1, shuffle=True, random_state=666)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "dtest = xgb.DMatrix(test_set[feature_cols])" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "params_xgb = {'objective': 'reg:squarederror',\n", " 'booster': 'gbtree',\n", " 'eta': 0.01,\n", " 'max_depth': 30,\n", " 'subsample': 0.8,\n", " 'colsample_bytree': 0.95,\n", " 'min_child_weight': 60,\n", " 'seed': 42}" ] }, { "cell_type": "code", "execution_count": 78, "outputs": [], "source": [ "from sklearn.model_selection import KFold" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 80, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MSE: 6.9E-04, RMSE: 0.0262, MAE: 0.018, MAPE: 3.81 %, R_2: 0.8015\n", "MSE: 4.6E-04, RMSE: 0.0215, MAE: 0.0155, MAPE: 3.24 %, R_2: 0.8596\n", "MSE: 1.1E-03, RMSE: 0.0337, MAE: 0.0214, MAPE: 4.6 %, R_2: 0.6518\n", "MSE: 8.7E-04, RMSE: 0.0295, MAE: 0.019, MAPE: 4.14 %, R_2: 0.7524\n", "MSE: 1.1E-03, RMSE: 0.0326, MAE: 0.0219, MAPE: 4.62 %, R_2: 0.695\n", "MSE: 1.1E-03, RMSE: 0.0336, MAE: 0.0237, MAPE: 5.23 %, R_2: 0.6424\n", "MSE: 6.0E-04, RMSE: 0.0245, MAE: 0.0164, MAPE: 3.46 %, R_2: 0.8288\n", "MSE: 9.4E-04, RMSE: 0.0307, MAE: 0.0224, MAPE: 4.96 %, R_2: 0.7396\n", "MSE: 6.6E-04, RMSE: 0.0256, MAE: 0.0174, MAPE: 3.73 %, R_2: 0.8133\n", "MSE: 7.0E-04, RMSE: 0.0264, MAE: 0.017, MAPE: 3.59 %, R_2: 0.8201\n" ] } ], "source": [ "kf = KFold(n_splits=10, shuffle=True, random_state=666)\n", "eva_list = list()\n", "for (train_index, test_index) in kf.split(train_data):\n", " train = train_data.loc[train_index]\n", " test = train_data.loc[test_index]\n", " train, valid = train_test_split(train, test_size=0.1, random_state=666)\n", " X_train, Y_train = train[feature_cols], train['power_co2_factor']\n", " X_valid, Y_valid = valid[feature_cols], valid['power_co2_factor']\n", " X_test, Y_test = valid[feature_cols], valid['power_co2_factor']\n", " dtrain = xgb.DMatrix(X_train, Y_train)\n", " dvalid = xgb.DMatrix(X_valid, Y_valid)\n", " watchlist = [(dvalid, 'eval')]\n", " gb_model = xgb.train(params_xgb, dtrain, 2000, evals=watchlist,\n", " early_stopping_rounds=100, verbose_eval=False)\n", " y_pred = gb_model.predict(xgb.DMatrix(X_test))\n", " y_true = Y_test.values\n", " MSE = mean_squared_error(y_true, y_pred)\n", " RMSE = np.sqrt(mean_squared_error(y_true, y_pred))\n", " MAE = mean_absolute_error(y_true, y_pred)\n", " MAPE = mean_absolute_percentage_error(y_true, y_pred)\n", " R_2 = r2_score(y_true, y_pred)\n", " print('MSE:', format(MSE, '.1E'), end=', ')\n", " print('RMSE:', round(RMSE, 4), end=', ')\n", " print('MAE:', round(MAE, 4), end=', ')\n", " print('MAPE:', round(MAPE*100, 2), '%', end=', ')\n", " print('R_2:', round(R_2, 4)) #R方为负就说明拟合效果比平均值差\n", " eva_list.append([MSE, RMSE, MAE, MAPE, R_2])\n" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 83, "outputs": [ { "data": { "text/plain": "MSE 0.000747\nRMSE 0.027126\nMAE 0.018437\nMAPE 0.039442\nR_2 0.788768\ndtype: float64" }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame.from_records(eva_list, columns=['MSE', 'RMSE', 'MAE', 'MAPE', 'R_2']).drop(index=[2, 5]).mean()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "\n", "num_boost_round = 2000\n", "\n", "dtrain = xgb.DMatrix(train[feature_cols], train['power_co2_factor'].values)\n", "dvalid = xgb.DMatrix(valid[feature_cols], valid['power_co2_factor'].values)\n", "watchlist = [(dtrain, 'train'), (dvalid, 'eval')]\n", "\n", "gb_model_power = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist,\n", " early_stopping_rounds=200, verbose_eval=False)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 59, "outputs": [], "source": [ "power_pred, power_real = gb_model_power.predict(dvalid), valid['power_co2_factor'].values" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 60, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MSE: 5.2E-04\n", "RMSE: 0.023\n", "MAE: 0.016\n", "MAPE: 3.46 %\n", "R_2: 0.819\n" ] } ], "source": [ "MSE = mean_squared_error(power_real, power_pred)\n", "RMSE = np.sqrt(mean_squared_error(power_real, power_pred))\n", "MAE = mean_absolute_error(power_real, power_pred)\n", "MAPE = mean_absolute_percentage_error(power_real, power_pred)\n", "R_2 = r2_score(power_real, power_pred)\n", "print('MSE:', format(MSE, '.1E'))\n", "print('RMSE:', round(RMSE, 3))\n", "print('MAE:', round(MAE, 3))\n", "print('MAPE:', round(MAPE*100, 2), '%')\n", "print('R_2:', round(R_2, 3)) #R方为负就说明拟合效果比平均值差a" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "new_test_data['power_co2_factor'] = gb_model_power.predict(dtest)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
所处地区机组类型参数分类冷凝器型式铭牌容量 (MW)longitudelatitudealtitudepower_co2_factor
0安徽省凝气式亚临界水冷5.7714413.4515834.7720942.3978950.513529
1安徽省凝气式亚临界水冷5.7714413.4515834.7720942.3978950.513529
2安徽省凝气式超超临界水冷6.9087553.4515834.7720942.3978950.478943
3安徽省凝气式超超临界水冷6.9087553.4515834.7720942.3978950.478943
4安徽省抽凝式高压水冷3.7135723.4515834.7720942.3978950.510681
..............................
3219重庆市抽背式高压其他3.9318263.4274894.6823535.6454470.510508
3220重庆市抽背式高压其他3.9318263.4274894.6823535.6454470.510508
3221重庆市抽凝式高压水冷3.9120233.4274894.6823535.6454470.512501
3222重庆市背压式高压其他3.4339873.4287154.6822085.6903590.509951
3223重庆市抽凝式高压水冷4.8362823.4287154.6822085.6903590.511886
\n", "

3224 rows × 9 columns

\n", "
" ], "text/plain": [ " 所处地区 机组类型 参数分类 冷凝器型式 铭牌容量 (MW) longitude latitude altitude \\\n", "0 安徽省 凝气式 亚临界 水冷 5.771441 3.451583 4.772094 2.397895 \n", "1 安徽省 凝气式 亚临界 水冷 5.771441 3.451583 4.772094 2.397895 \n", "2 安徽省 凝气式 超超临界 水冷 6.908755 3.451583 4.772094 2.397895 \n", "3 安徽省 凝气式 超超临界 水冷 6.908755 3.451583 4.772094 2.397895 \n", "4 安徽省 抽凝式 高压 水冷 3.713572 3.451583 4.772094 2.397895 \n", "... ... ... ... ... ... ... ... ... \n", "3219 重庆市 抽背式 高压 其他 3.931826 3.427489 4.682353 5.645447 \n", "3220 重庆市 抽背式 高压 其他 3.931826 3.427489 4.682353 5.645447 \n", "3221 重庆市 抽凝式 高压 水冷 3.912023 3.427489 4.682353 5.645447 \n", "3222 重庆市 背压式 高压 其他 3.433987 3.428715 4.682208 5.690359 \n", "3223 重庆市 抽凝式 高压 水冷 4.836282 3.428715 4.682208 5.690359 \n", "\n", " power_co2_factor \n", "0 0.513529 \n", "1 0.513529 \n", "2 0.478943 \n", "3 0.478943 \n", "4 0.510681 \n", "... ... \n", "3219 0.510508 \n", "3220 0.510508 \n", "3221 0.512501 \n", "3222 0.509951 \n", "3223 0.511886 \n", "\n", "[3224 rows x 9 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_test_data" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "params_xgb = {'objective': 'reg:squarederror',\n", " 'booster': 'gbtree',\n", " 'eta': 0.01,\n", " 'max_depth': 15,\n", " 'subsample': 0.7,\n", " 'colsample_bytree': 0.9,\n", " 'min_child_weight': 10,\n", " 'seed': 666}\n", "\n", "num_boost_round = 1200" ] }, { "cell_type": "code", "execution_count": 85, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MSE: 1.2E-05, RMSE: 0.0034, MAE: 0.002, MAPE: 2.93 %, R_2: 0.7571\n", "MSE: 3.9E-06, RMSE: 0.002, MAE: 0.0014, MAPE: 2.01 %, R_2: 0.9072\n", "MSE: 2.1E-05, RMSE: 0.0045, MAE: 0.0024, MAPE: 3.67 %, R_2: 0.4898\n", "MSE: 1.3E-05, RMSE: 0.0036, MAE: 0.002, MAPE: 3.01 %, R_2: 0.6941\n", "MSE: 1.2E-05, RMSE: 0.0034, MAE: 0.002, MAPE: 2.92 %, R_2: 0.7163\n", "MSE: 1.5E-05, RMSE: 0.0039, MAE: 0.0022, MAPE: 3.29 %, R_2: 0.6265\n", "MSE: 5.8E-06, RMSE: 0.0024, MAE: 0.0014, MAPE: 2.06 %, R_2: 0.8744\n", "MSE: 1.7E-05, RMSE: 0.0041, MAE: 0.0024, MAPE: 3.64 %, R_2: 0.6661\n", "MSE: 8.4E-06, RMSE: 0.0029, MAE: 0.0018, MAPE: 2.61 %, R_2: 0.8057\n", "MSE: 7.0E-06, RMSE: 0.0026, MAE: 0.0016, MAPE: 2.29 %, R_2: 0.8514\n" ] } ], "source": [ "kf = KFold(n_splits=10, shuffle=True, random_state=666)\n", "eva_list = list()\n", "for (train_index, test_index) in kf.split(train_data):\n", " train = train_data.loc[train_index]\n", " test = train_data.loc[test_index]\n", " train, valid = train_test_split(train, test_size=0.1, random_state=666)\n", " X_train, Y_train = train[feature_cols], train['heat_co2_factor']\n", " X_valid, Y_valid = valid[feature_cols], valid['heat_co2_factor']\n", " X_test, Y_test = valid[feature_cols], valid['heat_co2_factor']\n", " dtrain = xgb.DMatrix(X_train, Y_train)\n", " dvalid = xgb.DMatrix(X_valid, Y_valid)\n", " watchlist = [(dvalid, 'eval')]\n", " gb_model = xgb.train(params_xgb, dtrain, 2000, evals=watchlist,\n", " early_stopping_rounds=100, verbose_eval=False)\n", " y_pred = gb_model.predict(xgb.DMatrix(X_test))\n", " y_true = Y_test.values\n", " MSE = mean_squared_error(y_true, y_pred)\n", " RMSE = np.sqrt(mean_squared_error(y_true, y_pred))\n", " MAE = mean_absolute_error(y_true, y_pred)\n", " MAPE = mean_absolute_percentage_error(y_true, y_pred)\n", " R_2 = r2_score(y_true, y_pred)\n", " print('MSE:', format(MSE, '.1E'), end=', ')\n", " print('RMSE:', round(RMSE, 4), end=', ')\n", " print('MAE:', round(MAE, 4), end=', ')\n", " print('MAPE:', round(MAPE*100, 2), '%', end=', ')\n", " print('R_2:', round(R_2, 4)) #R方为负就说明拟合效果比平均值差\n", " eva_list.append([MSE, RMSE, MAE, MAPE, R_2])\n" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 86, "outputs": [ { "data": { "text/plain": "MSE 0.000010\nRMSE 0.003161\nMAE 0.001866\nMAPE 0.027510\nR_2 0.766523\ndtype: float64" }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame.from_records(eva_list, columns=['MSE', 'RMSE', 'MAE', 'MAPE', 'R_2']).drop(index=[2]).mean()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "\n", "dtrain = xgb.DMatrix(train[feature_cols], train['heat_co2_factor'].values)\n", "dvalid = xgb.DMatrix(valid[feature_cols], valid['heat_co2_factor'].values)\n", "watchlist = [(dtrain, 'train'), (dvalid, 'eval')]\n", "\n", "gb_model_heat = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist,\n", " early_stopping_rounds=100, verbose_eval=False)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 36, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "new_test_data['heat_co2_factor'] = gb_model_heat.predict(dtest)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "for col in num_cols:\n", " new_test_data[col] = np.expm1(new_test_data[col])" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
所处地区机组类型参数分类冷凝器型式铭牌容量 (MW)longitudelatitudealtitudepower_co2_factorheat_co2_factor
0安徽省凝气式亚临界水冷320.030.550295117.16639110.00.5135290.073187
1安徽省凝气式亚临界水冷320.030.550295117.16639110.00.5135290.073187
2安徽省凝气式超超临界水冷1000.030.550295117.16639110.00.4789430.071981
3安徽省凝气式超超临界水冷1000.030.550295117.16639110.00.4789430.071981
4安徽省抽凝式高压水冷40.030.550295117.16639110.00.5106810.072166
.................................
3219重庆市抽背式高压其他50.029.799200107.023948282.00.5105080.071945
3220重庆市抽背式高压其他50.029.799200107.023948282.00.5105080.071945
3221重庆市抽凝式高压水冷49.029.799200107.023948282.00.5125010.072097
3222重庆市背压式高压其他30.029.836998107.008326295.00.5099510.071945
3223重庆市抽凝式高压水冷125.029.836998107.008326295.00.5118860.072097
\n", "

3224 rows × 10 columns

\n", "
" ], "text/plain": [ " 所处地区 机组类型 参数分类 冷凝器型式 铭牌容量 (MW) longitude latitude altitude \\\n", "0 安徽省 凝气式 亚临界 水冷 320.0 30.550295 117.166391 10.0 \n", "1 安徽省 凝气式 亚临界 水冷 320.0 30.550295 117.166391 10.0 \n", "2 安徽省 凝气式 超超临界 水冷 1000.0 30.550295 117.166391 10.0 \n", "3 安徽省 凝气式 超超临界 水冷 1000.0 30.550295 117.166391 10.0 \n", "4 安徽省 抽凝式 高压 水冷 40.0 30.550295 117.166391 10.0 \n", "... ... ... ... ... ... ... ... ... \n", "3219 重庆市 抽背式 高压 其他 50.0 29.799200 107.023948 282.0 \n", "3220 重庆市 抽背式 高压 其他 50.0 29.799200 107.023948 282.0 \n", "3221 重庆市 抽凝式 高压 水冷 49.0 29.799200 107.023948 282.0 \n", "3222 重庆市 背压式 高压 其他 30.0 29.836998 107.008326 295.0 \n", "3223 重庆市 抽凝式 高压 水冷 125.0 29.836998 107.008326 295.0 \n", "\n", " power_co2_factor heat_co2_factor \n", "0 0.513529 0.073187 \n", "1 0.513529 0.073187 \n", "2 0.478943 0.071981 \n", "3 0.478943 0.071981 \n", "4 0.510681 0.072166 \n", "... ... ... \n", "3219 0.510508 0.071945 \n", "3220 0.510508 0.071945 \n", "3221 0.512501 0.072097 \n", "3222 0.509951 0.071945 \n", "3223 0.511886 0.072097 \n", "\n", "[3224 rows x 10 columns]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_test_data" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "rst = new_test_data.copy()" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "rst = pd.read_excel('./results/全国机组预测数据.xlsx')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "rst.drop(columns=rst.columns[0], inplace=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "def change_cap(x):\n", " if x <= 300:\n", " return '300MW以下'\n", " elif x<=600:\n", " return '300-600MW'\n", " elif x<=1000:\n", " return '600-1000MW'\n", " else:\n", " return \"1000MW以上\"" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "rst['容量类型'] = rst['铭牌容量 (MW)'].apply(change_cap)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "rst.to_excel('./results/全国机组预测数据.xlsx', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" } }, "nbformat": 4, "nbformat_minor": 4 }