41 lines
1.7 KiB
Python
41 lines
1.7 KiB
Python
import pandas as pd
|
|
from fuzzywuzzy import fuzz
|
|
|
|
# Load the two Excel workbooks.
df1 = pd.read_excel(r'C:\Users\97532\Desktop\陕西、福建_陈晓东\福建省.xlsx')
df2 = pd.read_excel(r'C:\Users\97532\Desktop\陕西、福建_陈晓东\福建省_许可.xlsx')

# Show df2's raw column names so they can be checked, then strip any
# surrounding whitespace from them.
print("原始列名:", df2.columns)
df2.columns = df2.columns.str.strip()

# Coerce the name columns to str and the capacity columns to float.
# The conversions run in this exact order.
for frame, column, dtype in (
    (df1, '电厂名称', str),
    (df2, '电厂名称/项目名称', str),
    (df1, '发电装机容量', float),
    (df2, '总装机容量', float),
):
    frame[column] = frame[column].astype(dtype)
|
|
|
|
|
|
# Define the function that performs the fuzzy match
|
|
def fuzzy_match(row, df2, capacity_tolerance=0.01):
    """Return the df2 row whose name best fuzzy-matches ``row``.

    A df2 row is a candidate only when its '总装机容量' differs from
    ``row['发电装机容量']`` by less than ``capacity_tolerance``. Among the
    candidates, the one with the highest ``fuzz.partial_ratio`` between
    ``row['电厂名称']`` and its '电厂名称/项目名称' wins. On a tie the
    earliest row in df2 is kept, and a score of 0 never counts as a match.

    Args:
        row: Mapping-like record (e.g. a pandas Series) providing
            '电厂名称' and '发电装机容量'.
        df2: DataFrame providing '电厂名称/项目名称' and '总装机容量'.
        capacity_tolerance: Exclusive upper bound on the absolute capacity
            difference for a row to be considered. Defaults to 0.01.

    Returns:
        The best matching row of df2 as a Series, or None when no row
        qualifies.
    """
    if df2.empty:
        return None

    target_name = row['电厂名称']
    target_capacity = row['发电装机容量']

    best_position = None
    highest_score = 0
    # Iterate only the two columns that are needed instead of building a
    # full Series per row with iterrows().
    candidates = zip(df2['电厂名称/项目名称'], df2['总装机容量'])
    for position, (name, capacity) in enumerate(candidates):
        # Cheap numeric filter first, so the costly fuzzy comparison only
        # runs for rows that can actually match. Written as "not (<)" so
        # that a NaN capacity on either side is rejected.
        if not abs(target_capacity - capacity) < capacity_tolerance:
            continue
        score = fuzz.partial_ratio(target_name, name)
        # Strict ">" keeps the first row seen among equal scores.
        if score > highest_score:
            highest_score = score
            best_position = position

    if best_position is None:
        return None
    return df2.iloc[best_position]
|
|
|
|
|
|
# Fuzzy-match every df1 row against df2 and copy the matched fields over.
for index1, row1 in df1.iterrows():
    match_row = fuzzy_match(row1, df2)
    if match_row is None:
        # No df2 row qualified; leave this df1 row untouched.
        continue
    for field in ('单位名称', '发电许可证编号', '电厂名称/项目名称', '机组类型'):
        df1.at[index1, field] = match_row[field]

# Write the filled-in table to a new Excel file.
df1.to_excel(r'C:\Users\97532\Desktop\陕西、福建_陈晓东\福建省_填充后_模糊匹配.xlsx', index=False)
print("数据填充已保存")
|