LCA-LLM/LCA_RAG/QAdata.py

104 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import time
from qwen_agent.agents import Assistant
from pprint import pprint
import json
import re
'''
生成QAdata后续不用
'''
def extract_qa(s):
    """Extract question/answer pairs from a model response string.

    The generation prompt asks the model to prefix each question with '#'
    (ending in an ASCII '?') and each answer with '@' (ending in the
    Chinese full stop '。').

    Parameters:
        s: raw response text (newlines already removed by the caller).

    Returns:
        (questions, answers): two lists of extracted strings; they may
        have different lengths if the model deviated from the format.
    """
    questions = re.findall(r'#([^#]*?\?)', s)
    # Answers are terminated by the Chinese full stop '。'.
    answers = re.findall(r'@([^@]*?。)', s)
    if not questions:
        # Fallback when questions lack a '?' terminator: take everything
        # between '#' markers. NOTE: the previous pattern '#([^#]*?)' was
        # broken — a trailing lazy '*?' always matches the empty string,
        # so it only ever produced lists of ''.
        questions = re.findall(r'#([^#]+)', s)
    print("Q:", questions)
    print("A:", answers)
    return questions, answers
# Drop Q/A pairs where either side is shorter than min_length characters.
def filter_short_answers(questions, answers, min_length=8):
    """Return (questions, answers) keeping only pairs where both the
    question and the answer are at least ``min_length`` characters long.

    Pairing is positional via ``zip``; any unpaired trailing items are
    silently discarded, matching the original behavior.
    """
    kept_pairs = [
        (q, a)
        for q, a in zip(questions, answers)
        if len(a) >= min_length and len(q) >= min_length
    ]
    kept_questions = [q for q, _ in kept_pairs]
    kept_answers = [a for _, a in kept_pairs]
    return kept_questions, kept_answers
def write_to_file(filename, data_list):
    """Append every item of ``data_list`` to ``filename``, one per line.

    The file is opened in append mode with UTF-8 encoding, so repeated
    calls accumulate lines rather than overwriting earlier output.
    """
    with open(filename, 'a', encoding='utf-8') as out:
        out.writelines(f"{entry}\n" for entry in data_list)
# --- Batch QA generation over a folder of pre-split LCA documents ---
data = "/home/zhangxj/WorkFile/LCA-GPT/split_LCAdata/folder6"
docs = os.listdir(data)

llm_cfg = {
    'model': 'qwen-plus',
    'model_server': 'dashscope',
    # SECURITY: an API key used to be hard-coded here, i.e. a secret
    # committed to the repository. Read it from the environment instead;
    # set DASHSCOPE_API_KEY before running, and revoke the leaked key.
    'api_key': os.getenv("DASHSCOPE_API_KEY", ""),
}

# System prompt (kept verbatim): asks for 10 '#'-prefixed questions and
# 10 '@'-prefixed answers with constrained lengths, which extract_qa()
# later parses out of the response.
system_instruction = '''你是一位专注于生命周期分析LCA领域的数据分析助手。在LCA领域的目标和范围定义、数据清单收集和分析、生命周期影响评价、结果分析和政策建议等方面有着丰富的经验和知识。
请根据下面的文档提出10个问题及其相应的答案规定每个问题的字符数量为x答案的字符数量为y
"x>40 & x<70;y>40 & y<70"
10个question结果的输出为10个字符串"#问题1:"开头;
10个对应的answer结果输出为10个字符串"@答案1:"开头,答案以""结尾,不要换行。
'''
tools = ['code_interpreter']  # `code_interpreter` is a built-in tool for executing code (currently unused).

questions = []  # all filtered questions accumulated across documents
answers = []    # all filtered answers accumulated across documents

# Output files; write_to_file appends, so results survive across iterations.
file1 = "/home/zhangxj/WorkFile/LCA-GPT/QA/originData/ques.txt"
file2 = "/home/zhangxj/WorkFile/LCA-GPT/QA/originData/answer.txt"

# Process each document
for doc in docs:
    doc_path = os.path.join(data, doc)
    files = [doc_path]
    prompt = "分析这篇文章根据文章研究的内容并按照格式输出10个与LCA领域相关的问题和相应的答案。"
    # BUGFIX: the chat history used to be a single list shared across
    # iterations, appending the same user prompt once per document and
    # never recording assistant replies — so document N re-sent N copies
    # of the prompt. Use a fresh single-turn history per document.
    messages = [{'role': 'user', 'content': prompt}]
    assistant = Assistant(llm=llm_cfg,
                          system_message=system_instruction,
                          # function_list=tools,
                          files=files)
    response = []
    # assistant.run streams partial responses; keep only the final one.
    for response in assistant.run(messages=messages):
        continue
    content = response[0]['content']
    content = content.replace('\n', '')  # extract_qa expects a single line
    print(content)
    question, answer = extract_qa(content)
    filterq, filtera = filter_short_answers(question, answer)
    questions.extend(filterq)
    answers.extend(filtera)
    write_to_file(file1, filterq)
    write_to_file(file2, filtera)
    # Pause for a while to avoid hitting API rate limits
    time.sleep(3)