87 lines
2.4 KiB
Python
87 lines
2.4 KiB
Python
|
import nltk
|
|||
|
from nltk.tokenize import word_tokenize
|
|||
|
from nltk import pos_tag
|
|||
|
import jieba.posseg as pseg
|
|||
|
|
|||
|
# Download the NLTK data required for tokenization and POS tagging
|
|||
|
nltk.download('punkt')
|
|||
|
nltk.download('averaged_perceptron_tagger')
|
|||
|
|
|||
|
|
|||
|
from nltk.stem import WordNetLemmatizer
|
|||
|
import string
|
|||
|
import re
|
|||
|
|
|||
|
def preprocess_eng(text):
    """Clean an English string so that only ASCII letters and spaces remain.

    Steps, applied in order:
      1. delete every character listed in ``string.punctuation``;
      2. replace each run of digits with a single space;
      3. delete any remaining character that is not an ASCII letter,
         an ASCII digit or whitespace;
      4. collapse each run of whitespace into one space.

    Letter case is left unchanged, and no spell checking, stemming or
    lemmatization is performed. Leading/trailing whitespace is collapsed
    to a single space but not stripped.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    # Mapping every punctuation code point to None makes translate() drop it.
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    cleaned = text.translate(punctuation_table)

    # (pattern, replacement) pairs; the order matters and mirrors the steps above.
    substitutions = (
        (r'\d+', ' '),
        (r'[^A-Za-z0-9\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
|||
|
|
|||
|
def preprocess_zh(text):
    """Strip Latin letters, digits, punctuation and whitespace from Chinese text.

    The input is coerced with ``str()`` first, so non-string values (for
    example ``None`` or a float NaN) are accepted; their textual form is
    made of ASCII letters and is therefore removed entirely.

    Args:
        text: The value to clean.

    Returns:
        str: The cleaned text, containing no whitespace. Characters that are
        not matched by any of the patterns below (for example symbols such
        as ``%``) are left in place.
    """
    text = str(text)
    # Remove ASCII letters and all digits.
    text = re.sub(r'[a-zA-Z]', ' ', text)
    text = re.sub(r'\d', ' ', text)
    # Remove Chinese punctuation. The full-width forms of , ! ? ; : ( ) are
    # written as \u escapes so they cannot be silently folded into their
    # ASCII look-alikes; the trailing '-' is a literal hyphen.
    text = re.sub(
        r'[,。!?、;:“”()《》【】'
        r'\uff0c\uff01\uff1f\uff1b\uff1a\uff08\uff09-]',
        ' ',
        text,
    )
    # Remove ASCII punctuation.
    text = re.sub(r'[.,!?;:"\'\(\)\[\]{}]', ' ', text)
    # Remove all whitespace, including the placeholder spaces inserted above.
    text = re.sub(r'\s+', '', text)

    return text
|
|||
|
|
|||
|
# English noun extraction
|
|||
|
def get_noun_en(text):
    """Return the nouns of an English text joined by single spaces.

    The text is tokenized with NLTK's ``word_tokenize`` and tagged with
    ``pos_tag``; tokens whose tag starts with ``'NN'`` are kept in their
    original order.

    Args:
        text (str): The English text to analyse.

    Returns:
        str: The kept tokens separated by spaces, or ``''`` when no token
        has a tag starting with ``'NN'``.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        token for token, pos in tagged_tokens if pos.startswith('NN')
    )
|
|||
|
|
|||
|
# Chinese noun extraction
|
|||
|
def get_noun_zh(text):
    """Return the nouns of a Chinese text joined by single spaces.

    The input is coerced with ``str()`` and that coerced string is what gets
    segmented, so non-string values no longer reach jieba unconverted.
    A value whose string form is ``'nan'`` (such as a float NaN) or that is
    blank yields ``''`` without invoking the segmenter.

    Args:
        text: The Chinese text to analyse.

    Returns:
        str: The words whose jieba POS flag starts with ``'n'``, in original
        order and separated by spaces; ``''`` when there are none.
    """
    content = str(text)
    # Blank input cannot contain nouns, and 'nan' marks a missing value.
    if content == 'nan' or not content.strip():
        return ''
    # Segment the coerced string rather than the raw argument.
    tagged_words = pseg.cut(content)
    return ' '.join(
        word for word, flag in tagged_words if flag.startswith('n')
    )
|
|||
|
|
|||
|
def has_no_chinese(text):
    """Report whether a text is free of Chinese characters.

    A character counts as Chinese when it falls inside one of the inclusive
    ranges U+4E00-U+9FFF, U+3400-U+4DBF, U+2F00-U+2FDF, U+3100-U+312F or
    U+31A0-U+31BF.

    Args:
        text (str): The text to check.

    Returns:
        bool: True if no character lies in any of the ranges above
        (including when the text is empty), otherwise False.
    """
    chinese_ranges = (
        ('\u4e00', '\u9fff'),
        ('\u3400', '\u4dbf'),
        ('\u2f00', '\u2fdf'),
        ('\u3100', '\u312f'),
        ('\u31a0', '\u31bf'),
    )
    return not any(
        low <= char <= high
        for char in text
        for low, high in chinese_ranges
    )
|