添加翻译
This commit is contained in:
parent
4998a82e39
commit
426b755c33
|
@ -21,15 +21,12 @@ model = EmbeddingModel(path)
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def process_and_embed(sentence):
|
def process_and_embed(sentence):
|
||||||
# 判断是否为中文
|
# 字符串全部转化为中文处理
|
||||||
if has_no_chinese(sentence):
|
sentence = translate(sentence)
|
||||||
# 英文处理
|
|
||||||
clean_text = preprocess_eng(sentence)
|
# 中文预处理
|
||||||
processed_text = get_noun_en(clean_text)
|
clean_text = preprocess_zh(sentence)
|
||||||
else:
|
processed_text = get_noun_zh(clean_text)
|
||||||
# 中文处理
|
|
||||||
clean_text = preprocess_zh(sentence)
|
|
||||||
processed_text = get_noun_zh(clean_text)
|
|
||||||
|
|
||||||
# 如果处理后为空,使用原始文本
|
# 如果处理后为空,使用原始文本
|
||||||
if not processed_text.strip():
|
if not processed_text.strip():
|
||||||
|
|
|
@ -3,14 +3,16 @@ from nltk.tokenize import word_tokenize
|
||||||
from nltk import pos_tag
|
from nltk import pos_tag
|
||||||
import jieba.posseg as pseg
|
import jieba.posseg as pseg
|
||||||
|
|
||||||
# 下载相关数据
|
# # 下载相关数据
|
||||||
nltk.download('punkt')
|
# nltk.download('punkt')
|
||||||
nltk.download('averaged_perceptron_tagger')
|
# nltk.download('averaged_perceptron_tagger')
|
||||||
|
|
||||||
|
|
||||||
from nltk.stem import WordNetLemmatizer
|
from nltk.stem import WordNetLemmatizer
|
||||||
import string
|
import string
|
||||||
import re
|
import re
|
||||||
|
from langchain.prompts import ChatPromptTemplate
|
||||||
|
from langchain.schema import SystemMessage, HumanMessage
|
||||||
|
from langchain_openai import ChatOpenAI
|
||||||
|
|
||||||
def preprocess_eng(text):
|
def preprocess_eng(text):
|
||||||
'''
|
'''
|
||||||
|
@ -85,3 +87,48 @@ def has_no_chinese(text):
|
||||||
'\u31a0' <= char <= '\u31bf':
|
'\u31a0' <= char <= '\u31bf':
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def extract_List(text):
    """Return the content of the last ``[...]`` group found in *text*.

    The pattern is non-greedy and ``.`` does not match newlines, so each
    match is the shortest single-line run between a ``[`` and the next ``]``.

    Args:
        text: String to search.

    Returns:
        The text between the brackets of the last match (may be an empty
        string for ``[]``), or ``None`` when *text* has no bracketed group.
    """
    matches = re.findall(r'\[(.*?)\]', text)
    try:
        return matches[-1]
    except IndexError as e:
        # Indexing an empty result list is the only failure possible here,
        # so catch exactly that instead of every Exception.
        print("字符串处理异常!", e)
        return None
|
||||||
|
|
||||||
|
def translate(query):
    """Ask the DeepSeek chat model to rewrite *query* entirely in Chinese.

    The system prompt instructs the model to answer with the translated
    query wrapped in ``[]``; the bracketed text is then pulled out with
    ``extract_List``.

    Args:
        query: The user query, possibly mixing Chinese and English.

    Returns:
        The bracketed text extracted from the model response, or ``None``
        when the response contains no ``[...]`` group.

    Raises:
        RuntimeError: If the ``DEEPSEEK_API_KEY`` environment variable is
            not set.
    """
    import os

    # SECURITY: the API key used to be hard-coded in this function. It is now
    # read from the environment; the previously committed key should be
    # revoked because it remains visible in the repository history.
    api_key = os.environ.get("DEEPSEEK_API_KEY")
    if not api_key:
        raise RuntimeError("DEEPSEEK_API_KEY environment variable is not set")

    sys_template = (
        "\n"
        "你是一个专注于化工、环境学科领域的翻译专家。\n"
        "用户将提供一个生命周期评价领域数据库的查询,查询可能包含中英文字符。你的任务是:\n"
        "\n"
        "1. 将查询中的所有英文表述转化为对应的中文表述;\n"
        "2. 确保转化后的查询中不含任何非中文语言;\n"
        "3. 将完整的中文查询以“[]”格式返回;\n"
        "4. 不返回除“[]”格式外的任何其他内容。\n"
        "请严格按照上述要求执行。\n"
    )
    human_template = "查询内容为:{context}"

    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", sys_template),
        ("human", human_template),
    ])
    messages = chat_prompt.format_messages(context=query)

    llm = ChatOpenAI(
        model="deepseek-chat",
        base_url="https://api.deepseek.com",
        api_key=api_key,
        temperature=0,
    )
    response = llm.invoke(messages)

    # The model is told to reply with "[...]" only; keep just the bracketed part.
    return extract_List(response.content)
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test: translate one mixed Chinese/English query and show the result.
    print(translate("HCOOH的定义是什么?"))
|
||||||
|
|
Loading…
Reference in New Issue