2025-03-19 21:02:49 +08:00
|
|
|
|
import nltk
|
|
|
|
|
from nltk.tokenize import word_tokenize
|
|
|
|
|
from nltk import pos_tag
|
|
|
|
|
import jieba.posseg as pseg
|
|
|
|
|
|
2025-03-21 16:09:46 +08:00
|
|
|
|
# # One-time setup: download the NLTK data needed by word_tokenize / pos_tag (uncomment and run once)
|
|
|
|
|
# nltk.download('punkt')
|
|
|
|
|
# nltk.download('averaged_perceptron_tagger')
|
2025-03-19 21:02:49 +08:00
|
|
|
|
|
|
|
|
|
from nltk.stem import WordNetLemmatizer
|
|
|
|
|
import string
|
|
|
|
|
import re
|
2025-03-21 16:09:46 +08:00
|
|
|
|
from langchain.prompts import ChatPromptTemplate
|
|
|
|
|
from langchain.schema import SystemMessage, HumanMessage
|
|
|
|
|
from langchain_openai import ChatOpenAI
|
2025-03-31 10:07:03 +08:00
|
|
|
|
import logging
|
|
|
|
|
from typing import Optional
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
# Logging setup: configure the root logger at INFO level with a
# timestamp/name/level/message layout, then create this module's named logger.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger('translation_service')
|
2025-03-19 21:02:49 +08:00
|
|
|
|
|
|
|
|
|
def preprocess_eng(text, lowercase=False):
    """Reduce an English string to space-separated runs of ASCII letters.

    The steps are applied in this order:

    1. Delete every ASCII punctuation character. Nothing is inserted in its
       place, so ``"carbon-dioxide"`` becomes ``"carbondioxide"``.
    2. Replace each run of digits with a single space.
    3. Delete every remaining character that is neither an ASCII letter nor
       whitespace (accented letters, CJK characters, symbols, ...).
    4. Collapse each run of whitespace into one space.

    Leading/trailing whitespace is collapsed but not stripped. Spelling is
    assumed to be correct (the original note says the data is imported from
    ecoinvent), and stemming/lemmatization is deliberately not performed
    because it was found to be inaccurate.

    Args:
        text (str): The English text to clean.
        lowercase (bool): When True, lower-case the cleaned result. Defaults
            to False, which preserves the historical behaviour of this
            function (case is kept as-is).

    Returns:
        str: The cleaned text.
    """
    # Remove ASCII punctuation (deleted, not replaced by a space).
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits; a space is left behind so neighbouring words stay apart.
    text = re.sub(r'\d+', ' ', text)
    # Remove anything that is not an ASCII letter/digit or whitespace.
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Collapse whitespace runs.
    text = re.sub(r'\s+', ' ', text)
    if lowercase:
        # Done last: only ASCII letters remain, so this is a plain ASCII fold
        # and the result always equals ``preprocess_eng(text).lower()``.
        text = text.lower()
    return text
|
|
|
|
|
|
|
|
|
|
def preprocess_zh(text):
    """Strip everything that is not Chinese-language content from *text*.

    The input is first coerced with ``str()`` (so a float NaN becomes the
    string ``"nan"``, which is then removed as ASCII letters). After that the
    following are deleted:

    * ASCII letters,
    * decimal digits,
    * every punctuation or symbol character, Chinese or Western
      (Unicode general categories ``P*`` and ``S*``),
    * all whitespace.

    Earlier revisions removed only a short hand-written list of punctuation
    marks, so characters such as ``%``, ``/``, ``+`` or ``=`` leaked into the
    result; the category-based filter removes a superset of that list.
    Letters of other scripts (e.g. Greek) are left untouched.

    Args:
        text: The value to clean; any object is accepted and stringified.

    Returns:
        str: The cleaned text with no whitespace.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import section.
    import unicodedata

    text = str(text)
    # Remove English letters.
    text = re.sub(r'[a-zA-Z]', '', text)
    # Remove digits.
    text = re.sub(r'\d', '', text)
    # Remove punctuation and symbols of any script.
    text = ''.join(
        ch for ch in text if unicodedata.category(ch)[0] not in 'PS'
    )
    # Remove all whitespace.
    return re.sub(r'\s+', '', text)
|
|
|
|
|
|
|
|
|
|
# English noun extraction
def get_noun_en(text):
    """Return the nouns found in an English string, joined by single spaces.

    The text is tokenized with NLTK's ``word_tokenize`` and tagged with
    ``pos_tag``; every token whose tag starts with ``'NN'`` is kept, in its
    original order.

    Args:
        text (str): English text to analyse.

    Returns:
        str: The selected tokens separated by spaces ('' when none match).
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        token for token, pos in tagged_tokens if pos.startswith('NN')
    )
|
|
|
|
|
|
|
|
|
|
# Chinese noun extraction
def get_noun_zh(text):
    """Return the nouns found in a Chinese string, joined by single spaces.

    The input is coerced with ``str()``. A value whose string form is
    ``'nan'`` yields ``''`` (presumably to tolerate missing cells coming from
    pandas -- verify against callers). Otherwise the string is segmented with
    ``jieba.posseg`` and every word whose part-of-speech flag starts with
    ``'n'`` is kept, in its original order.

    Args:
        text: The value to analyse; any object is accepted and stringified.

    Returns:
        str: The selected words separated by spaces ('' when none match).
    """
    x = str(text)
    if x == 'nan':
        return ''
    # Segment the coerced string: the original passed the raw ``text`` here,
    # which defeated the str() coercion for non-string input.
    return ' '.join(
        word for word, flag in pseg.cut(x) if flag.startswith('n')
    )
|
|
|
|
|
|
2025-03-31 10:07:03 +08:00
|
|
|
|
def all_chinese(text):
    """Return True when every character of *text* is a Chinese character.

    A character counts as Chinese when it lies in one of these ranges:

    * U+4E00-U+9FFF  CJK Unified Ideographs
    * U+3400-U+4DBF  CJK Unified Ideographs Extension A
    * U+2F00-U+2FDF  Kangxi Radicals
    * U+3100-U+312F  Bopomofo
    * U+31A0-U+31BF  Bopomofo Extended

    Anything else -- Latin letters, digits, punctuation, whitespace -- makes
    the result False. An empty string returns True (there is no offending
    character).

    Args:
        text (str): The text to check.

    Returns:
        bool: True if all characters are Chinese, otherwise False.
    """
    return all(
        '\u4e00' <= char <= '\u9fff'
        or '\u3400' <= char <= '\u4dbf'
        or '\u2f00' <= char <= '\u2fdf'
        or '\u3100' <= char <= '\u312f'
        or '\u31a0' <= char <= '\u31bf'
        for char in text
    )
|
|
|
|
|
|
|
|
|
|
|
2025-03-31 10:07:03 +08:00
|
|
|
|
def extract_list(text: str) -> Optional[str]:
    """Return the content of the last non-empty ``[...]`` group in *text*.

    Bracket groups are matched non-greedily on a single line, so
    ``"[a] [b]"`` contains two groups. Each group's content is stripped of
    surrounding whitespace; groups that are empty after stripping (``"[]"``,
    ``"[  ]"``) are ignored so that an empty answer is reported as "nothing
    found" rather than as a successful empty result.

    Args:
        text: The text to search. Non-string values are rejected.

    Returns:
        The stripped content of the last non-empty group, or None when
        *text* is not a string or contains no such group.
    """
    if not isinstance(text, str):
        return None

    # Walk the groups from last to first and return the first usable one.
    for candidate in reversed(re.findall(r'\[(.*?)\]', text)):
        candidate = candidate.strip()
        if candidate:
            return candidate
    return None
|
|
|
|
|
|
2025-03-31 10:07:03 +08:00
|
|
|
|
# System prompt used by translate(). It is sent to the model as-is: it asks
# for every English expression in the query to be rendered in Chinese and for
# the full Chinese query to be returned wrapped in "[]" and nothing else.
_TRANSLATE_SYSTEM_PROMPT = '''
你是一个专注于化工、环境学科领域的翻译专家。
用户将提供一个生命周期评价领域数据库的查询,查询可能包含中英文字符。你的任务是:

1. 将查询中的所有英文表述转化为对应的中文表述;
2. 确保转化后的查询中不含任何非中文语言;
3. 将完整的中文查询以"[]"格式返回;
4. 不返回除"[]"格式外的任何其他内容。
请严格按照上述要求执行。
'''


def translate(query: str) -> Optional[str]:
    """Translate the English parts of *query* into Chinese via the chat model.

    The query is sent to the ``deepseek-chat`` model (through
    ``ChatOpenAI`` pointed at ``https://api.deepseek.com``) together with a
    system prompt asking for the answer wrapped in ``[]``. The bracketed
    content is pulled out of the reply with ``extract_list``. If the call
    raises, or no non-empty bracketed content can be extracted, the request
    is retried after a one-second pause, up to three attempts in total.

    The API key is read from the ``DEEPSEEK_API_KEY`` environment variable.
    NOTE: earlier revisions embedded the key in this file; that key should be
    treated as leaked and rotated.

    Args:
        query: The query text, possibly mixing Chinese and English.

    Returns:
        The translated query, or None when *query* is empty / not a string,
        the API key is not configured, or every attempt failed. Failures are
        logged, not raised.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import section.
    import os

    if not query or not isinstance(query, str):
        return None

    api_key = os.environ.get("DEEPSEEK_API_KEY")
    if not api_key:
        logger.error("未设置环境变量 DEEPSEEK_API_KEY,无法调用翻译服务")
        return None

    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", _TRANSLATE_SYSTEM_PROMPT),
        ("human", "查询内容为:{context}"),
    ])
    messages = chat_prompt.format_messages(context=query)

    llm = ChatOpenAI(
        model="deepseek-chat",
        base_url="https://api.deepseek.com",
        api_key=api_key,
        temperature=0,
    )

    # Try at most three times.
    max_attempts = 3
    # Only the first 50 characters of the query are written to the log.
    preview = f"{query[:50]}{'...' if len(query) > 50 else ''}"

    for attempt in range(1, max_attempts + 1):
        logger.info(f"翻译尝试 {attempt}/{max_attempts}:{preview}")

        try:
            # Call the API and try to extract the bracketed answer.
            content = llm.invoke(messages).content
            result = extract_list(content)

            # Truthiness check: an empty extraction counts as a failure and
            # triggers a retry, as the contract above states.
            if result:
                logger.info(f"成功提取翻译结果 (尝试 {attempt}/{max_attempts})")
                return result

            logger.warning(f"未能提取翻译结果 (尝试 {attempt}/{max_attempts}): {content[:100]}")
        except Exception as e:
            # Boundary with a remote service: log and fall through to retry.
            logger.error(f"翻译过程中发生异常 (尝试 {attempt}/{max_attempts}): {e}")

        # Pause briefly before the next attempt, whatever the failure was.
        if attempt < max_attempts:
            time.sleep(1)

    logger.error("所有尝试均失败,无法获取有效翻译结果")
    return None
|
|
|
|
|
|
|
|
|
|
# Usage example: translate a sample query and print the outcome.
if __name__ == "__main__":
    translated = translate("HCOOH")
    print(f"翻译结果: {translated}" if translated else "翻译失败")
|
2025-03-21 16:09:46 +08:00
|
|
|
|
|