# LCA_LLM_application/Retrieval_new/utils.py

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import jieba.posseg as pseg

# Download the NLTK data packages needed by this module: 'punkt' (tokenizer
# models, presumably for word_tokenize) and 'averaged_perceptron_tagger'
# (presumably for pos_tag).
# NOTE: these calls run every time the module is imported.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


from nltk.stem import WordNetLemmatizer
import string
import re

def preprocess_eng(text):
    """Clean an English string so that only ASCII letters and spaces remain.

    Steps, applied in order:
      1. Delete every character in ``string.punctuation`` (no space is
         inserted, so ``"heat/power"`` becomes ``"heatpower"``).
      2. Replace each run of digits with a single space.
      3. Delete any remaining character that is not an ASCII letter, an
         ASCII digit or whitespace.
      4. Collapse each run of whitespace into one space.

    Case is left untouched (no lowercasing is performed) and no stemming or
    lemmatization is applied. Leading/trailing spaces are not stripped.

    Args:
        text (str): English text to clean.

    Returns:
        str: The cleaned text.
    """
    # Step 1: strip punctuation in a single translate pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(punctuation_table)

    # Steps 2-4: ordered (pattern, replacement) pairs.
    substitutions = (
        (r'\d+', ' '),              # digit runs -> one space
        (r'[^A-Za-z0-9\s]', ''),    # drop any other non-alphanumeric symbol
        (r'\s+', ' '),              # squeeze whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def preprocess_zh(text):
    """Clean a Chinese string by removing Latin letters, digits and punctuation.

    The input is first converted with ``str``. ASCII letters, digits, the
    listed Chinese punctuation marks (plus ``-``) and the listed English
    punctuation marks are each replaced by a space; afterwards all
    whitespace is removed, so the result contains no spaces at all.
    Characters not covered by these patterns (e.g. ``/`` or ``%``) are kept.

    Args:
        text: Value to clean; any object is accepted and stringified.

    Returns:
        str: The cleaned text without whitespace.
    """
    # Patterns whose matches are blanked out, in the order they are applied.
    blank_out_patterns = (
        r'[a-zA-Z]',                          # English letters
        r'\d',                                # digits
        r'[，。！？、；：“”（）《》【】-]',      # Chinese punctuation
        r'[.,!?;:"\'\(\)\[\]{}]',             # English punctuation
    )

    cleaned = str(text)
    for pattern in blank_out_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Finally drop every whitespace character, including the blanks added above.
    return re.sub(r'\s+','',cleaned)

# English noun extraction
def get_noun_en(text):
    """Return the nouns of an English text joined by single spaces.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``;
    tokens whose tag starts with ``'NN'`` are kept in their original order.

    Args:
        text (str): English text to analyse.

    Returns:
        str: Space-separated nouns, or ``''`` when none are found.
    """
    # Tokenize, then attach a part-of-speech tag to every token.
    tagged_tokens = pos_tag(word_tokenize(text))
    # Keep only the tokens tagged as nouns.
    return ' '.join(
        token for token, tag in tagged_tokens if tag.startswith('NN')
    )

# Chinese noun extraction
def get_noun_zh(text):
    """Return the nouns of a Chinese text joined by single spaces.

    The input is converted with ``str`` before segmentation, so non-string
    values (numbers, ``None``) are segmented rather than passed raw to
    jieba. A value whose string form is ``'nan'`` (for example a missing
    float cell) yields an empty string.

    Args:
        text: Text to analyse; any object is accepted and stringified.

    Returns:
        str: Words whose jieba part-of-speech flag starts with ``'n'``,
        space-separated in their original order; ``''`` for NaN input or
        when no noun is found.
    """
    sentence = str(text)
    if sentence == 'nan':
        return ''
    # Segment the coerced string rather than the raw argument: the original
    # code passed ``text`` here, which fails for non-string inputs.
    return ' '.join(
        word for word, flag in pseg.cut(sentence) if flag.startswith('n')
    )

def has_no_chinese(text):
    """Tell whether a text is free of Chinese characters.

    A character counts as Chinese when it falls in one of the code point
    ranges listed in the function body.

    Args:
        text (str): Text to inspect.

    Returns:
        bool: True if no character lies in any of the ranges (including
        for an empty text), otherwise False.
    """
    chinese_ranges = (
        ('\u4e00', '\u9fff'),  # CJK Unified Ideographs
        ('\u3400', '\u4dbf'),  # CJK Unified Ideographs Extension A
        ('\u2f00', '\u2fdf'),  # Kangxi Radicals
        ('\u3100', '\u312f'),  # Bopomofo
        ('\u31a0', '\u31bf'),  # Bopomofo Extended
    )
    return not any(
        low <= ch <= high
        for ch in text
        for low, high in chinese_ranges
    )