# LCA_LLM_application/Retrieval_new/utils.py


import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import jieba.posseg as pseg

# Download the NLTK models needed below (tokenizer and POS tagger)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def preprocess_eng(text):
    '''
    English text preprocessing: lowercase; remove punctuation (tentative),
    remove special symbols, keep only words.
    Spell checking is unnecessary because the text is imported from ecoinvent
    and contains no spelling errors.
    Stemming and lemmatization: could be applied, but some reductions are
    inaccurate, so this step is skipped.
    '''
    # lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove digits
    text = re.sub(r'\d+', ' ', text)
    # remove any remaining non-alphanumeric characters
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text
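
# Usage sketch for preprocess_eng (hypothetical input; output reflects the
# lowercasing step above):
#   >>> preprocess_eng("Electricity, high voltage {CN}")
#   'electricity high voltage cn'
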
def preprocess_zh(text):
    '''
    Chinese text preprocessing: keep only the Chinese content; remove English
    letters, digits, and punctuation.
    '''
    text = str(text)
    # remove English letters
    text = re.sub(r'[a-zA-Z]', ' ', text)
    # remove digits
    text = re.sub(r'\d', ' ', text)
    # remove Chinese punctuation
    text = re.sub(r'[,。!?、;:“”()《》【】-]', ' ', text)
    # remove English punctuation
    text = re.sub(r'[.,!?;:"\'\(\)\[\]{}]', ' ', text)
    # remove all whitespace
    text = re.sub(r'\s+', '', text)
    return text
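
# Usage sketch for preprocess_zh (hypothetical input; the regexes above make
# this output deterministic):
#   >>> preprocess_zh("水泥生产 cement, 42.5级")
#   '水泥生产级'
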
# English noun extraction
def get_noun_en(text):
    # tokenize
    words = word_tokenize(text)
    # POS-tag the tokens
    tagged = pos_tag(words)
    # keep tokens tagged as nouns (NN, NNS, NNP, NNPS)
    nouns = [word for word, tag in tagged if tag.startswith('NN')]
    noun = ' '.join(nouns)
    return noun
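
# Usage sketch for get_noun_en (hypothetical input; the exact result depends
# on the NLTK tagger model):
#   >>> get_noun_en("production of cement clinker")
#   'production cement clinker'
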
# Chinese noun extraction
def get_noun_zh(text):
    text = str(text)
    # NaN cells (e.g. from pandas) become the string 'nan' after str();
    # treat them as empty
    if text == 'nan':
        return ''
    words = pseg.cut(text)
    # keep tokens whose jieba POS flag starts with 'n' (noun categories)
    nouns = [word for word, flag in words if flag.startswith('n')]
    noun = ' '.join(nouns)
    return noun
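
# Usage sketch for get_noun_zh (hypothetical input; jieba's segmentation and
# POS flags may vary with dictionary version):
#   >>> get_noun_zh("中国的水泥生产")
#   '中国 水泥'
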
def has_no_chinese(text):
    """
    Check whether a text contains no Chinese characters.

    Args:
        text (str): the text to check

    Returns:
        bool: True if the text contains no Chinese characters, False otherwise
    """
    for char in text:
        # CJK Unified Ideographs, CJK Extension A, Kangxi Radicals,
        # Bopomofo, and Bopomofo Extended blocks
        if '\u4e00' <= char <= '\u9fff' or \
           '\u3400' <= char <= '\u4dbf' or \
           '\u2f00' <= char <= '\u2fdf' or \
           '\u3100' <= char <= '\u312f' or \
           '\u31a0' <= char <= '\u31bf':
            return False
    return True
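
# Minimal sketch of how these helpers can be combined: use has_no_chinese to
# route a record to the English or Chinese pipeline. The routing shown here is
# an assumption for illustration, not necessarily how the project dispatches.
if __name__ == '__main__':
    for sample in ["Electricity, high voltage {CN}", "水泥生产 cement, 42.5级"]:
        if has_no_chinese(sample):
            # English record: clean it, then keep only the nouns
            print(get_noun_en(preprocess_eng(sample)))
        else:
            # Chinese record: clean it, then keep only the nouns
            print(get_noun_zh(preprocess_zh(sample)))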