diff --git a/Retrieval_new/embedding.py b/Retrieval_new/embedding.py
index 32a1184..ee88807 100644
--- a/Retrieval_new/embedding.py
+++ b/Retrieval_new/embedding.py
@@ -20,17 +20,15 @@ path = "/home/zhangxj/models/bge-large-zh-v1.5"
 model = EmbeddingModel(path)
 
 @lru_cache()
-def process_and_embed(sentence):
-    # 判断是否为中文
-    if has_no_chinese(sentence):
-        # 英文处理
-        clean_text = preprocess_eng(sentence)
-        processed_text = get_noun_en(clean_text)
-    else:
-        # 中文处理
-        clean_text = preprocess_zh(sentence)
-        processed_text = get_noun_zh(clean_text)
-
+def process_and_embed(sentence):
+    # Translate the query so that it is entirely Chinese before processing;
+    # keep the original text if the translation comes back empty.
+    sentence = translate(sentence) or sentence
+
+    # Chinese preprocessing
+    clean_text = preprocess_zh(sentence)
+    processed_text = get_noun_zh(clean_text)
+
     # 如果处理后为空,使用原始文本
     if not processed_text.strip():
         processed_text = sentence
diff --git a/Retrieval_new/utils.py b/Retrieval_new/utils.py
index a8fc4f0..4fb0213 100644
--- a/Retrieval_new/utils.py
+++ b/Retrieval_new/utils.py
@@ -3,14 +3,17 @@ from nltk.tokenize import word_tokenize
 from nltk import pos_tag
 import jieba.posseg as pseg
 
-# 下载相关数据
-nltk.download('punkt')
-nltk.download('averaged_perceptron_tagger')
-
+# # Download the required NLTK data
+# nltk.download('punkt')
+# nltk.download('averaged_perceptron_tagger')
 
 from nltk.stem import WordNetLemmatizer
 import string
 import re
+import os
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema import SystemMessage, HumanMessage
+from langchain_openai import ChatOpenAI
 
 def preprocess_eng(text):
     '''
@@ -84,4 +87,86 @@ def has_no_chinese(text):
            '\u3100' <= char <= '\u312f' or \
            '\u31a0' <= char <= '\u31bf':
             return False
-    return True
\ No newline at end of file
+    return True
+
+
+def extract_List(text):
+    """Return the content of the last ``[...]`` segment in *text*.
+
+    Args:
+        text: String to search, e.g. a raw LLM answer.
+
+    Returns:
+        The text between the brackets of the last match (brackets
+        excluded), or None when *text* contains no ``[...]`` segment.
+    """
+    # Non-greedy so that each bracket pair is matched separately.
+    matches = re.findall(r'\[(.*?)\]', text)
+    try:
+        return matches[-1]
+    except IndexError as e:
+        # No bracketed segment at all: report it and let the caller decide.
+        print("字符串处理异常!", e)
+        return None
+
+
+def translate(query):
+    """Rewrite *query* entirely in Chinese using the DeepSeek chat model.
+
+    The query is sent together with a system prompt asking the model to
+    translate every English expression and to answer in ``[...]`` form; the
+    last bracketed segment of the answer is taken as the translation.
+
+    Args:
+        query: The user query; may mix Chinese and English.
+
+    Returns:
+        The translated query, or *query* unchanged when the answer holds
+        no non-empty ``[...]`` segment.
+
+    Raises:
+        RuntimeError: If the DEEPSEEK_API_KEY environment variable is unset.
+    """
+    sys_template = '''
+    你是一个专注于化工、环境学科领域的翻译专家。
+    用户将提供一个生命周期评价领域数据库的查询,查询可能包含中英文字符。你的任务是:
+
+    1. 将查询中的所有英文表述转化为对应的中文表述;
+    2. 确保转化后的查询中不含任何非中文语言;
+    3. 将完整的中文查询以“[]”格式返回;
+    4. 不返回除“[]”格式外的任何其他内容。
+    请严格按照上述要求执行。
+    '''
+    human_template = "查询内容为:{context}"
+
+    # Credentials must never be committed: read the key from the environment.
+    api_key = os.environ.get("DEEPSEEK_API_KEY")
+    if not api_key:
+        raise RuntimeError(
+            "DEEPSEEK_API_KEY is not set; it is required to call DeepSeek"
+        )
+
+    chat_prompt = ChatPromptTemplate.from_messages([
+        ("system", sys_template),
+        ("human", human_template),
+    ])
+    messages = chat_prompt.format_messages(context=query)
+
+    llm = ChatOpenAI(
+        model="deepseek-chat",
+        base_url="https://api.deepseek.com",
+        api_key=api_key,
+        temperature=0,
+    )
+    response = llm.invoke(messages)
+    result = extract_List(response.content)
+    # extract_List returns None (or "") when the model ignores the "[...]"
+    # format; fall back to the original query so callers always get a string.
+    if not result or not result.strip():
+        return query
+    return result
+
+
+if __name__ == '__main__':
+    res = translate("HCOOH的定义是什么?")
+    print(res)