添加翻译

This commit is contained in:
zhangxiaojun 2025-03-21 16:09:46 +08:00
parent 4998a82e39
commit 426b755c33
2 changed files with 60 additions and 16 deletions

View File

@ -20,17 +20,14 @@ path = "/home/zhangxj/models/bge-large-zh-v1.5"
model = EmbeddingModel(path)
@lru_cache()
def process_and_embed(sentence):
# 判断是否为中文
if has_no_chinese(sentence):
# 英文处理
clean_text = preprocess_eng(sentence)
processed_text = get_noun_en(clean_text)
else:
# 中文处理
clean_text = preprocess_zh(sentence)
processed_text = get_noun_zh(clean_text)
def process_and_embed(sentence):
# 字符串全部转化为中文处理
sentence = translate(sentence)
# 中文预处理
clean_text = preprocess_zh(sentence)
processed_text = get_noun_zh(clean_text)
# 如果处理后为空,使用原始文本
if not processed_text.strip():
processed_text = sentence

View File

@ -3,14 +3,16 @@ from nltk.tokenize import word_tokenize
from nltk import pos_tag
import jieba.posseg as pseg
# 下载相关数据
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# # 下载相关数据
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
import string
import re
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
def preprocess_eng(text):
'''
@ -84,4 +86,49 @@ def has_no_chinese(text):
'\u3100' <= char <= '\u312f' or \
'\u31a0' <= char <= '\u31bf':
return False
return True
return True
def extract_List(text):
    """Return the content of the last ``[...]`` group found in *text*.

    The match is non-greedy, so each ``[`` pairs with the nearest following
    ``]``; the surrounding brackets are not included in the result.

    Args:
        text: String to search (here, the raw reply of the translation model).

    Returns:
        The text inside the last bracket pair, or ``None`` when *text* is not
        a string or contains no bracket pair. A diagnostic line is printed in
        both failure cases.
    """
    # re.findall raises TypeError on non-string input; the original called it
    # outside the try block, so that error escaped the intended handling.
    if not isinstance(text, str):
        print("字符串处理异常!", f"expected str, got {type(text).__name__}")
        return None
    # DOTALL lets '.' cross line breaks so a bracketed answer spanning several
    # lines is still captured.
    matches = re.findall(r'\[(.*?)\]', text, flags=re.DOTALL)
    try:
        return matches[-1]
    except IndexError as e:
        # An empty match list is the only way the lookup above can fail.
        print("字符串处理异常!", e)
        return None
def translate(query):
    """Rewrite a possibly mixed Chinese/English query as Chinese using an LLM.

    A system prompt asks the model to return the fully Chinese query wrapped
    in ``[]``; the bracketed part of the reply is extracted with
    ``extract_List``.

    Args:
        query: The user query, inserted into the human message template.

    Returns:
        The bracketed text from the model reply. If the reply has no usable
        bracketed text (missing or blank), the original *query* is returned
        unchanged so that callers always receive usable text instead of
        ``None``.
    """
    import os  # local import: only needed here to read the API key override

    sys_template = '''
你是一个专注于化工环境学科领域的翻译专家
用户将提供一个生命周期评价领域数据库的查询查询可能包含中英文字符你的任务是
1. 将查询中的所有英文表述转化为对应的中文表述
2. 确保转化后的查询中不含任何非中文语言
3. 将完整的中文查询以[]格式返回
4. 不返回除[]格式外的任何其他内容
请严格按照上述要求执行
'''
    human_template = "查询内容为:{context}"
    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", sys_template),
        ("human", human_template)
    ])
    messages = chat_prompt.format_messages(context=query)

    # NOTE(security): a credential is committed in source. DEEPSEEK_API_KEY
    # takes precedence when set; the literal is kept only so existing
    # deployments keep working and should be rotated and removed.
    api_key = os.environ.get(
        "DEEPSEEK_API_KEY", "sk-3e42e538bc39411ab80761106d83dda9"
    )
    llm = ChatOpenAI(
        model="deepseek-chat",
        base_url="https://api.deepseek.com",
        api_key=api_key,
        temperature=0,
    )
    response = llm.invoke(messages)
    result = extract_List(response.content)

    # The model may ignore the "[...]" format or return empty brackets; fall
    # back to the untranslated query rather than handing None/blank text on.
    if not result or not result.strip():
        return query
    return result
if __name__ == '__main__':
    # Manual smoke test: send one mixed Chinese/English query through
    # translate() and show whatever it returns.
    print(translate("HCOOH的定义是什么"))