T85_code/两张表特征对齐.ipynb

592 lines
19 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "8950aafd-80e8-4078-874c-966efdc4b0ac",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "50832980-f7e1-4a19-a5e0-b8a378ebd39b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>电厂名称</th>\n",
" <th>机组编号</th>\n",
" <th>铭牌容量 (MW)</th>\n",
" <th>机组类型</th>\n",
" <th>参数分类</th>\n",
" <th>冷凝器型式</th>\n",
" <th>入炉煤低位热值(kJ/kg)</th>\n",
" <th>燃煤挥发份Var(%)</th>\n",
" <th>燃煤灰份Aar(%)</th>\n",
" <th>煤种</th>\n",
" <th>所处地区</th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>altitude</th>\n",
" <th>发电碳排放因子(kg/kWh)</th>\n",
" <th>供热碳排放因子(kg/MJ)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>江苏利港电力有限公司</td>\n",
" <td>1</td>\n",
" <td>350.0</td>\n",
" <td>纯凝式</td>\n",
" <td>亚临界</td>\n",
" <td>水冷</td>\n",
" <td>21602.05000</td>\n",
" <td>26.09</td>\n",
" <td>16.80</td>\n",
" <td>烟煤</td>\n",
" <td>江苏省</td>\n",
" <td>120.09662</td>\n",
" <td>31.942361</td>\n",
" <td>1.0</td>\n",
" <td>0.586990</td>\n",
" <td>0.076843</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>江苏利港电力有限公司</td>\n",
" <td>1</td>\n",
" <td>350.0</td>\n",
" <td>纯凝式</td>\n",
" <td>亚临界</td>\n",
" <td>水冷</td>\n",
" <td>21926.81000</td>\n",
" <td>26.68</td>\n",
" <td>15.41</td>\n",
" <td>烟煤</td>\n",
" <td>江苏省</td>\n",
" <td>120.09662</td>\n",
" <td>31.942361</td>\n",
" <td>1.0</td>\n",
" <td>0.632859</td>\n",
" <td>0.077676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>江苏利港电力有限公司</td>\n",
" <td>1</td>\n",
" <td>350.0</td>\n",
" <td>纯凝式</td>\n",
" <td>亚临界</td>\n",
" <td>水冷</td>\n",
" <td>21261.93062</td>\n",
" <td>26.46</td>\n",
" <td>15.18</td>\n",
" <td>烟煤</td>\n",
" <td>江苏省</td>\n",
" <td>120.09662</td>\n",
" <td>31.942361</td>\n",
" <td>1.0</td>\n",
" <td>0.609196</td>\n",
" <td>0.074823</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 电厂名称 机组编号 铭牌容量 (MW) 机组类型 参数分类 冷凝器型式 入炉煤低位热值(kJ/kg) 燃煤挥发份Var(%) \\\n",
"0 江苏利港电力有限公司 1 350.0 纯凝式 亚临界 水冷 21602.05000 26.09 \n",
"1 江苏利港电力有限公司 1 350.0 纯凝式 亚临界 水冷 21926.81000 26.68 \n",
"2 江苏利港电力有限公司 1 350.0 纯凝式 亚临界 水冷 21261.93062 26.46 \n",
"\n",
" 燃煤灰份Aar(%) 煤种 所处地区 longitude latitude altitude 发电碳排放因子(kg/kWh) \\\n",
"0 16.80 烟煤 江苏省 120.09662 31.942361 1.0 0.586990 \n",
"1 15.41 烟煤 江苏省 120.09662 31.942361 1.0 0.632859 \n",
"2 15.18 烟煤 江苏省 120.09662 31.942361 1.0 0.609196 \n",
"\n",
" 供热碳排放因子(kg/MJ) \n",
"0 0.076843 \n",
"1 0.077676 \n",
"2 0.074823 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Training table; its 电厂名称 + 机组编号 columns are later used as the join key.\n",
"total_data = pd.read_csv('train_data.csv')\n",
"total_data.head(n=3)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "499cac72-c6a3-4b86-8aed-6fc010b12693",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5741, 16)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) of the training table.\n",
"total_data.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7ab5d82e-19bd-4aa4-9cd6-d2004718b00d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>发电类型</th>\n",
" <th>地区</th>\n",
" <th>城市</th>\n",
" <th>企业名称</th>\n",
" <th>机组编号</th>\n",
" <th>机组状态</th>\n",
" <th>机组数量</th>\n",
" <th>单机容量MW</th>\n",
" <th>总容量MW</th>\n",
" <th>核心设备类型</th>\n",
" <th>汽轮机类型</th>\n",
" <th>压力参数</th>\n",
" <th>冷却方式</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>煤电</td>\n",
" <td>安徽省</td>\n",
" <td>安庆市</td>\n",
" <td>国能神皖安庆发电有限责任公司</td>\n",
" <td>1</td>\n",
" <td>在役</td>\n",
" <td>1</td>\n",
" <td>320.0</td>\n",
" <td>320.0</td>\n",
" <td>煤粉锅炉</td>\n",
" <td>凝气式</td>\n",
" <td>亚临界</td>\n",
" <td>水冷-开式循环</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>煤电</td>\n",
" <td>安徽省</td>\n",
" <td>安庆市</td>\n",
" <td>国能神皖安庆发电有限责任公司</td>\n",
" <td>2</td>\n",
" <td>在役</td>\n",
" <td>1</td>\n",
" <td>320.0</td>\n",
" <td>320.0</td>\n",
" <td>煤粉锅炉</td>\n",
" <td>凝气式</td>\n",
" <td>亚临界</td>\n",
" <td>水冷-开式循环</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>煤电</td>\n",
" <td>安徽省</td>\n",
" <td>安庆市</td>\n",
" <td>国能神皖安庆发电有限责任公司</td>\n",
" <td>3</td>\n",
" <td>在役</td>\n",
" <td>1</td>\n",
" <td>1000.0</td>\n",
" <td>1000.0</td>\n",
" <td>煤粉锅炉</td>\n",
" <td>凝气式</td>\n",
" <td>超超临界</td>\n",
" <td>水冷-闭式循环</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 发电类型 地区 城市 企业名称 机组编号 机组状态 机组数量 单机容量MW 总容量MW 核心设备类型 \\\n",
"0 煤电 安徽省 安庆市 国能神皖安庆发电有限责任公司 1 在役 1 320.0 320.0 煤粉锅炉 \n",
"1 煤电 安徽省 安庆市 国能神皖安庆发电有限责任公司 2 在役 1 320.0 320.0 煤粉锅炉 \n",
"2 煤电 安徽省 安庆市 国能神皖安庆发电有限责任公司 3 在役 1 1000.0 1000.0 煤粉锅炉 \n",
"\n",
" 汽轮机类型 压力参数 冷却方式 \n",
"0 凝气式 亚临界 水冷-开式循环 \n",
"1 凝气式 亚临界 水冷-开式循环 \n",
"2 凝气式 超超临界 水冷-闭式循环 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reference table of units; supplies 汽轮机类型 / 压力参数 / 冷却方式 for the merge.\n",
"unit_data = pd.read_excel('./data/煤电机组情况(含企业名称).xlsx')\n",
"unit_data.head(n=3)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "c4d54203-5343-43df-b594-f6a13e6f47a1",
"metadata": {},
"outputs": [],
"source": [
"# Give both tables the same join-key schema: identical column name and a string-typed unit number.\n",
"# Reassign instead of using inplace=True so a frame that an earlier cell displayed is never mutated.\n",
"total_data = (\n",
"    total_data\n",
"    .rename(columns={'电厂名称': '企业名称'})\n",
"    .astype({'机组编号': 'str'})\n",
")\n",
"unit_data = unit_data.astype({'机组编号': 'str'})"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "757e26c3-cd1b-48a3-9668-78e13f40436f",
"metadata": {},
"outputs": [],
"source": [
"def change_type(x: str):\n",
"    \"\"\"Map a raw category label onto its canonical spelling.\n",
"\n",
"    Rules are tried in order and the first match wins; a label that matches no rule\n",
"    is returned with surrounding whitespace stripped.\n",
"\n",
"    Args:\n",
"        x: A cell value from a categorical column. Missing values (None/NaN) and any\n",
"            other non-string value are passed through untouched.\n",
"\n",
"    Returns:\n",
"        The canonical label, the stripped input, or the original non-string value.\n",
"    \"\"\"\n",
"    # Only text can be normalised; this also covers None/NaN without calling .strip() on them.\n",
"    if not isinstance(x, str):\n",
"        return x\n",
"    x = x.strip()\n",
"    # (keyword, canonical label, prefix_only); prefix_only rules require the label to START with the keyword.\n",
"    rules = (\n",
"        ('纯凝', '纯凝式', False),\n",
"        ('供热', '供热式', False),\n",
"        ('煤粉', '煤粉锅炉', False),\n",
"        ('循环流化床', '循环流化床锅炉', True),\n",
"        ('三废', '三废炉', False),\n",
"        ('直接空冷', '直接空冷', False),\n",
"        ('间接空冷', '间接空冷', False),\n",
"    )\n",
"    for keyword, canonical, prefix_only in rules:\n",
"        matched = x.startswith(keyword) if prefix_only else (keyword in x)\n",
"        if matched:\n",
"            return canonical\n",
"    return x"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "fcc7c556-ae7b-4be1-9163-709ce1ca084c",
"metadata": {},
"outputs": [],
"source": [
"# Join key shared by both tables, and the unit attributes to bring over.\n",
"key_cols = ['企业名称', '机组编号']\n",
"# Keep one lookup row per key so the left join can never multiply training rows.\n",
"unit_attrs = unit_data[key_cols + ['汽轮机类型', '压力参数', '冷却方式']].drop_duplicates(subset=key_cols)\n",
"merge_data = total_data.merge(unit_attrs, how='left', on=key_cols)\n",
"assert len(merge_data) == len(total_data), 'left merge changed the number of training rows'"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "3af6ad2f-a881-4ee6-9a27-ecbe75c97b31",
"metadata": {},
"outputs": [],
"source": [
"# Prefer the reference table's 汽轮机类型; fall back to the existing 机组类型 where it is missing.\n",
"merge_data['机组类型'] = merge_data['汽轮机类型'].fillna(merge_data['机组类型'])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "dec103bc-c868-4557-ba83-9bbb02f8e9f8",
"metadata": {},
"outputs": [],
"source": [
"# Prefer the reference table's 压力参数; fall back to the existing 参数分类 where it is missing.\n",
"merge_data['参数分类'] = merge_data['压力参数'].fillna(merge_data['参数分类'])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "454273f0-51ab-4a75-9c44-9ae8b7cc2a79",
"metadata": {},
"outputs": [],
"source": [
"# Prefer the reference table's 冷却方式; fall back to the existing 冷凝器型式 where it is missing.\n",
"merge_data['冷凝器型式'] = merge_data['冷却方式'].fillna(merge_data['冷凝器型式'])"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "d3c9cb26-63b4-4c72-9c5b-d90a2c5867ca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"水冷-闭式循环 2143\n",
"水冷 1166\n",
"水冷-开式循环 1101\n",
"空冷-直接空冷 492\n",
"直接空冷 241\n",
"空冷-间接空冷 154\n",
"间接空冷 74\n",
"空冷 19\n",
"其他 2\n",
"Name: 冷凝器型式, dtype: int64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# dropna=False also counts rows that still have no cooling type after the merge.\n",
"merge_data['冷凝器型式'].value_counts(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "30b2d793-7b44-434a-96e3-c6ce15295881",
"metadata": {},
"outputs": [],
"source": [
"# Drop the three helper columns added by the merge by name, not by position,\n",
"# so the result does not depend on column order or on how often cells were re-run.\n",
"use_data = merge_data.drop(columns=['汽轮机类型', '压力参数', '冷却方式'])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "fbdf13c0-6174-463b-9dd0-9ed736e6d126",
"metadata": {},
"outputs": [],
"source": [
"# Normalise the label spelling of each aligned categorical column, element by element.\n",
"for col in ('机组类型', '参数分类', '冷凝器型式'):\n",
"    use_data[col] = use_data[col].map(change_type)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "ff803c5a-5e56-462b-81fc-639877395d69",
"metadata": {},
"outputs": [],
"source": [
"# Persist the aligned training table without the positional index column.\n",
"use_data.to_excel('train_data.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "28d8d579-d816-4117-8c49-a755fdffe1a1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>发电类型</th>\n",
" <th>地区</th>\n",
" <th>城市</th>\n",
" <th>企业名称</th>\n",
" <th>机组编号</th>\n",
" <th>机组状态</th>\n",
" <th>机组数量</th>\n",
" <th>单机容量MW</th>\n",
" <th>总容量MW</th>\n",
" <th>核心设备类型</th>\n",
" <th>汽轮机类型</th>\n",
" <th>压力参数</th>\n",
" <th>冷却方式</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4026</th>\n",
" <td>煤电</td>\n",
" <td>山西省</td>\n",
" <td>临汾市</td>\n",
" <td>国家能源集团华北电力有限公司霍州发电厂</td>\n",
" <td>1</td>\n",
" <td>在役</td>\n",
" <td>1</td>\n",
" <td>600.0</td>\n",
" <td>600.0</td>\n",
" <td>煤粉锅炉</td>\n",
" <td>凝气式</td>\n",
" <td>超临界</td>\n",
" <td>空冷-直接空冷</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4027</th>\n",
" <td>煤电</td>\n",
" <td>山西省</td>\n",
" <td>临汾市</td>\n",
" <td>国家能源集团华北电力有限公司霍州发电厂</td>\n",
" <td>2</td>\n",
" <td>在役</td>\n",
" <td>1</td>\n",
" <td>600.0</td>\n",
" <td>600.0</td>\n",
" <td>煤粉锅炉</td>\n",
" <td>凝气式</td>\n",
" <td>超临界</td>\n",
" <td>空冷-直接空冷</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4056</th>\n",
" <td>煤电</td>\n",
" <td>山西省</td>\n",
" <td>吕梁市</td>\n",
" <td>霍州煤电集团吕梁山煤电有限公司方山发电厂</td>\n",
" <td>1</td>\n",
" <td>在役</td>\n",
" <td>1</td>\n",
" <td>60.0</td>\n",
" <td>60.0</td>\n",
" <td>循环流化床锅炉</td>\n",
" <td>抽凝式</td>\n",
" <td>高压</td>\n",
" <td>空冷-直接空冷</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4057</th>\n",
" <td>煤电</td>\n",
" <td>山西省</td>\n",
" <td>吕梁市</td>\n",
" <td>霍州煤电集团吕梁山煤电有限公司方山发电厂</td>\n",
" <td>2</td>\n",
" <td>在役</td>\n",
" <td>1</td>\n",
" <td>60.0</td>\n",
" <td>60.0</td>\n",
" <td>循环流化床锅炉</td>\n",
" <td>抽凝式</td>\n",
" <td>高压</td>\n",
" <td>空冷-直接空冷</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 发电类型 地区 城市 企业名称 机组编号 机组状态 机组数量 单机容量MW 总容量MW \\\n",
"4026 煤电 山西省 临汾市 国家能源集团华北电力有限公司霍州发电厂 1 在役 1 600.0 600.0 \n",
"4027 煤电 山西省 临汾市 国家能源集团华北电力有限公司霍州发电厂 2 在役 1 600.0 600.0 \n",
"4056 煤电 山西省 吕梁市 霍州煤电集团吕梁山煤电有限公司方山发电厂 1 在役 1 60.0 60.0 \n",
"4057 煤电 山西省 吕梁市 霍州煤电集团吕梁山煤电有限公司方山发电厂 2 在役 1 60.0 60.0 \n",
"\n",
" 核心设备类型 汽轮机类型 压力参数 冷却方式 \n",
"4026 煤粉锅炉 凝气式 超临界 空冷-直接空冷 \n",
"4027 煤粉锅炉 凝气式 超临界 空冷-直接空冷 \n",
"4056 循环流化床锅炉 抽凝式 高压 空冷-直接空冷 \n",
"4057 循环流化床锅炉 抽凝式 高压 空冷-直接空冷 "
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot check: reference-table rows whose company name contains '霍州'.\n",
"# na=False keeps the mask strictly boolean when a company name is missing.\n",
"unit_data[unit_data['企业名称'].str.contains('霍州', na=False)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20c531d0-62eb-4475-ab3a-3c8477f36a55",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}