# LCA_LLM_application/Retrieval_new/utils.py

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import jieba.posseg as pseg

# # 下载相关数据
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from nltk.stem import WordNetLemmatizer
import string
import re
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
import logging
from typing import Optional
import time

# Configure logging.
# NOTE: basicConfig runs at import time, so importing this module sets up the
# root logger (INFO level, "time - name - level - message" format) if the root
# logger has no handlers yet.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module logger used by extract_list() and translate() below.
logger = logging.getLogger('translation_service')

def preprocess_eng(text):
    """Reduce an English string to ASCII letters and single spaces.

    Steps, in order:
      1. delete every ``string.punctuation`` character;
      2. replace each run of digits with one space;
      3. delete any remaining character that is not an ASCII letter, a digit
         or whitespace;
      4. collapse every whitespace run into a single space.

    Case is left untouched, no stemming or lemmatization is applied, and a
    leading or trailing space is not stripped.
    """
    # Punctuation is deleted outright rather than replaced, so "a-b" -> "ab".
    cleaned = text.translate(str.maketrans('', '', string.punctuation))

    substitutions = (
        (r'\d+', ' '),             # digit runs -> one space
        (r'[^A-Za-z0-9\s]', ''),   # drop everything else that is not ASCII alphanumeric/whitespace
        (r'\s+', ' '),             # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def preprocess_zh(text):
    """Strip ASCII letters, digits, listed punctuation and whitespace from text.

    The argument is first converted with ``str()``. ASCII letters, digits,
    the listed Chinese punctuation marks and the listed ASCII punctuation
    marks are blanked out, and finally all whitespace is removed so the
    remaining characters are concatenated. Characters outside those classes
    (for example ``%``) are kept as they are.
    """
    cleaned = str(text)

    # Each of these character classes is replaced by a space first ...
    blanked_patterns = (
        r'[a-zA-Z]',                        # ASCII letters
        r'\d',                              # digits
        r'[，。！？、；：“”（）《》【】-]',  # Chinese punctuation (and hyphen)
        r'[.,!?;:"\'\(\)\[\]{}]',           # ASCII punctuation
    )
    for pattern in blanked_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    # ... and then every whitespace run is deleted.
    return re.sub(r'\s+', '', cleaned)

# English noun extraction
def get_noun_en(text):
    """Return the nouns of an English text, joined by single spaces.

    The text is tokenized with NLTK's ``word_tokenize`` and tagged with
    ``pos_tag``; tokens whose tag starts with ``NN`` are kept in their
    original order.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        token for token, pos in tagged_tokens if pos.startswith('NN')
    )

# Chinese noun extraction
def get_noun_zh(text):
    """Return the nouns of a Chinese text, joined by single spaces.

    The argument is converted with ``str()`` before anything else. A value
    whose string form is ``'nan'`` (e.g. a float NaN) yields an empty string.
    Otherwise the string is segmented and tagged with ``jieba.posseg`` and
    the words whose flag starts with ``n`` are kept in their original order.

    Args:
        text: Text to analyse; non-string values are stringified.

    Returns:
        The extracted nouns separated by spaces, or ``''`` for NaN input.
    """
    normalized = str(text)
    if normalized == 'nan':
        return ''

    # Segment the normalized string rather than the raw argument so that
    # non-string values (ints, None, ...) never reach jieba.
    tagged_words = pseg.cut(normalized)
    return ' '.join(
        word for word, flag in tagged_words if flag.startswith('n')
    )

def all_chinese(text):
    """Report whether every character of the text is a Chinese character.

    A character counts as Chinese when it falls in one of the Unicode ranges
    listed below. Any other character (ASCII letters, digits, punctuation,
    whitespace, ...) makes the result False.

    Args:
        text (str): Text to check.

    Returns:
        bool: True if all characters are Chinese (an empty string therefore
        yields True), False as soon as one character is not.
    """
    chinese_ranges = (
        ('\u4e00', '\u9fff'),  # CJK Unified Ideographs
        ('\u3400', '\u4dbf'),  # CJK Unified Ideographs Extension A
        ('\u2f00', '\u2fdf'),  # Kangxi Radicals
        ('\u3100', '\u312f'),  # Bopomofo
        ('\u31a0', '\u31bf'),  # Bopomofo Extended
    )
    return all(
        any(low <= char <= high for low, high in chinese_ranges)
        for char in text
    )

   
def extract_list(text: str) -> Optional[str]:
    """Return the content of the last ``[...]`` pair found in the text.

    Matching is non-greedy and does not span line breaks. ``None`` is
    returned when the argument is not a string, when no bracket pair is
    found, or when the search raises (the error is logged).
    """
    if isinstance(text, str):
        try:
            bracketed = re.findall(r'\[(.*?)\]', text)
            # Only the final match is of interest; earlier ones are ignored.
            return bracketed[-1] if bracketed else None
        except Exception as e:
            logger.error(f"字符串处理异常: {e}")
    return None
    
def translate(query: str) -> Optional[str]:
    """Translate the English parts of a query into Chinese with DeepSeek.

    The chat model is instructed to reply with the fully Chinese query
    wrapped in square brackets; the bracketed text is pulled out of the reply
    with ``extract_list``. Up to three attempts are made, pausing one second
    between attempts, whenever the call raises or no bracketed answer can be
    extracted.

    The API key is read from the ``DEEPSEEK_API_KEY`` environment variable so
    that no credential is stored in the source file.

    Args:
        query: Query text, possibly mixing Chinese and English.

    Returns:
        The bracketed content of the model reply, or ``None`` when the query
        is empty or not a string, the API key is not configured, or every
        attempt fails.
    """
    import os  # local import keeps this function self-contained

    if not query or not isinstance(query, str):
        return None

    # Credentials must come from the environment, never from source control.
    api_key = os.environ.get("DEEPSEEK_API_KEY")
    if not api_key:
        logger.error("未设置环境变量 DEEPSEEK_API_KEY，无法调用翻译服务")
        return None

    sys_template = '''
    你是一个专注于化工、环境学科领域的翻译专家。
    用户将提供一个生命周期评价领域数据库的查询，查询可能包含中英文字符。你的任务是：

    1. 将查询中的所有英文表述转化为对应的中文表述；
    2. 确保转化后的查询中不含任何非中文语言；
    3. 将完整的中文查询以"[]"格式返回；
    4. 不返回除"[]"格式外的任何其他内容。
    请严格按照上述要求执行。
    '''
    human_template = "查询内容为:{context}"

    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", sys_template),
        ("human", human_template)
    ])

    messages = chat_prompt.format_messages(context=query)

    llm = ChatOpenAI(
        model="deepseek-chat",
        base_url="https://api.deepseek.com",
        api_key=api_key,
        temperature=0,
    )

    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        logger.info(f"翻译尝试 {attempt}/{max_attempts}：{query[:50]}{'...' if len(query) > 50 else ''}")

        try:
            response = llm.invoke(messages)
            content = response.content

            result = extract_list(content)
            if result is not None:
                logger.info(f"成功提取翻译结果 (尝试 {attempt}/{max_attempts})")
                return result

            # The reply contained no bracketed answer; log a preview of it.
            logger.warning(f"未能提取翻译结果 (尝试 {attempt}/{max_attempts}): {content[:100]}")
        except Exception as e:
            # Best-effort boundary around the remote call: log and retry.
            logger.error(f"翻译过程中发生异常 (尝试 {attempt}/{max_attempts}): {e}")

        # Back off briefly before the next attempt, whatever the failure was.
        if attempt < max_attempts:
            time.sleep(1)

    logger.error("所有尝试均失败，无法获取有效翻译结果")
    return None

# Usage example: translate a sample query and print the outcome.
if __name__ == "__main__":
    sample_query = "HCOOH"
    translated = translate(sample_query)
    # A None or empty result is reported as a failure.
    print(f"翻译结果: {translated}" if translated else "翻译失败")