LCA-LLM/LCA_RAG/QAdata.py

104 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import time
from qwen_agent.agents import Assistant
from pprint import pprint
import json
import re
'''
生成QAdata后续不用
'''
def extract_qa(s):
    """Extract question/answer pairs from a model response string.

    The generation prompt asks the model to prefix each question with '#'
    (ending in an ASCII '?') and each answer with '@' (ending in the
    Chinese full stop '。').

    Parameters:
        s: raw response text (newlines already removed by the caller).

    Returns:
        (questions, answers): two lists of extracted strings; they may
        have different lengths if the model deviated from the format.
    """
    questions = re.findall(r'#([^#]*?\?)', s)
    # Answers are terminated by the Chinese full stop '。'.
    answers = re.findall(r'@([^@]*?。)', s)
    if not questions:
        # Fallback when questions lack a '?' terminator: take everything
        # between '#' markers. NOTE: the previous pattern '#([^#]*?)' was
        # broken — a trailing lazy '*?' always matches the empty string,
        # so it only ever produced lists of ''.
        questions = re.findall(r'#([^#]+)', s)
    print("Q:", questions)
    print("A:", answers)
    return questions, answers
# Drop Q/A pairs where either side is shorter than min_length characters.
def filter_short_answers(questions, answers, min_length=8):
    """Return (questions, answers) keeping only pairs where both the
    question and the answer are at least ``min_length`` characters long.

    Pairing is positional via ``zip``; any unpaired trailing items are
    silently discarded, matching the original behavior.
    """
    kept_pairs = [
        (q, a)
        for q, a in zip(questions, answers)
        if len(a) >= min_length and len(q) >= min_length
    ]
    kept_questions = [q for q, _ in kept_pairs]
    kept_answers = [a for _, a in kept_pairs]
    return kept_questions, kept_answers
def write_to_file(filename, data_list):
    """Append every item of ``data_list`` to ``filename``, one per line.

    The file is opened in append mode with UTF-8 encoding, so repeated
    calls accumulate lines rather than overwriting earlier output.
    """
    with open(filename, 'a', encoding='utf-8') as out:
        out.writelines(f"{entry}\n" for entry in data_list)
# --- Batch QA generation over a folder of pre-split LCA documents ---
data = "/home/zhangxj/WorkFile/LCA-GPT/split_LCAdata/folder6"
docs = os.listdir(data)

llm_cfg = {
    'model': 'qwen-plus',
    'model_server': 'dashscope',
    # SECURITY: an API key used to be hard-coded here, i.e. a secret
    # committed to the repository. Read it from the environment instead;
    # set DASHSCOPE_API_KEY before running, and revoke the leaked key.
    'api_key': os.getenv("DASHSCOPE_API_KEY", ""),
}

# System prompt (kept verbatim): asks for 10 '#'-prefixed questions and
# 10 '@'-prefixed answers with constrained lengths, which extract_qa()
# later parses out of the response.
system_instruction = '''你是一位专注于生命周期分析LCA领域的数据分析助手。在LCA领域的目标和范围定义、数据清单收集和分析、生命周期影响评价、结果分析和政策建议等方面有着丰富的经验和知识。
请根据下面的文档提出10个问题及其相应的答案规定每个问题的字符数量为x答案的字符数量为y
"x>40 & x<70;y>40 & y<70"
10个question结果的输出为10个字符串"#问题1:"开头;
10个对应的answer结果输出为10个字符串"@答案1:"开头,答案以""结尾,不要换行。
'''
tools = ['code_interpreter']  # `code_interpreter` is a built-in tool for executing code (currently unused).

questions = []  # all filtered questions accumulated across documents
answers = []    # all filtered answers accumulated across documents

# Output files; write_to_file appends, so results survive across iterations.
file1 = "/home/zhangxj/WorkFile/LCA-GPT/QA/originData/ques.txt"
file2 = "/home/zhangxj/WorkFile/LCA-GPT/QA/originData/answer.txt"

# Process each document
for doc in docs:
    doc_path = os.path.join(data, doc)
    files = [doc_path]
    prompt = "分析这篇文章根据文章研究的内容并按照格式输出10个与LCA领域相关的问题和相应的答案。"
    # BUGFIX: the chat history used to be a single list shared across
    # iterations, appending the same user prompt once per document and
    # never recording assistant replies — so document N re-sent N copies
    # of the prompt. Use a fresh single-turn history per document.
    messages = [{'role': 'user', 'content': prompt}]
    assistant = Assistant(llm=llm_cfg,
                          system_message=system_instruction,
                          # function_list=tools,
                          files=files)
    response = []
    # assistant.run streams partial responses; keep only the final one.
    for response in assistant.run(messages=messages):
        continue
    content = response[0]['content']
    content = content.replace('\n', '')  # extract_qa expects a single line
    print(content)
    question, answer = extract_qa(content)
    filterq, filtera = filter_short_answers(question, answer)
    questions.extend(filterq)
    answers.extend(filtera)
    write_to_file(file1, filterq)
    write_to_file(file2, filtera)
    # Pause for a while to avoid hitting API rate limits
    time.sleep(3)