LCA-LLM/LCA_RAG/utils.py

import streamlit as st
import tempfile
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import MessagesPlaceholder
from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader, JSONLoader
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_community.chat_models.tongyi import ChatTongyi
from langchain_community.chat_models import ChatZhipuAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
import pandas as pd
import json
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # expose only GPU 1 to this process
device = "cuda"
memory = 6

def DataAnalysis(data):
    """Placeholder: CSV data is meant to be summarized and passed to the LLM; not implemented yet."""
    return
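
# A possible direction for DataAnalysis (hypothetical sketch, not wired up):
# build a compact pandas summary and hand it to the chat model, e.g.
#
#   summary = data.describe().to_string()
#   llm.invoke(f"Please analyse this data summary:\n{summary}")
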
def chroma_save_upload(path):
    try:
        # Pick a loader based on the file extension
        file_type = os.path.splitext(path)[1].lstrip(".").lower()
        loader = None
        doc = None
        if file_type == "txt":
            loader = TextLoader(path, encoding="utf-8")
        elif file_type == "pdf":
            loader = PyPDFLoader(path)
        elif file_type == "csv":  # hand tabular data to the LLM for analysis
            data = pd.read_csv(path, encoding="utf-8")
            DataAnalysis(data)
            return  # CSV files produce no documents to index
        elif file_type == "json":
            # Uploaded files live on the local filesystem, so read them
            # directly instead of fetching over HTTP
            with open(path, encoding="utf-8") as f:
                json_data = json.load(f)
            splitter = RecursiveJsonSplitter(max_chunk_size=300)
            doc = splitter.create_documents(texts=[json_data])
        if doc is None:
            doc = loader.load()
        # Split the document content
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(doc)
        # Embed the chunks and persist them; from_documents already adds the
        # documents, so a second add_documents call would only duplicate them
        embedding = HuggingFaceEmbeddings(model_name="/home/zhangxj/models/acge_text_embedding",
                                          model_kwargs={"device": device})
        Chroma.from_documents(documents=splits, embedding=embedding, persist_directory="chromadb")
        print("Uploaded file saved: " + str(path))
    except Exception as e:
        print(f"Error in chroma_save_upload: {e}")
@st.cache_resource(ttl="1h")
def configure_retriever(uploaded_files):
    try:
        # Write the uploaded documents into a temporary directory, then index each one
        temp_dir = tempfile.TemporaryDirectory(dir="/home/zhangxj/WorkFile/LCA-GPT/tmp")
        for file in uploaded_files:
            temp_filepath = os.path.join(temp_dir.name, file.name)
            print("Document path:", temp_filepath)
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            chroma_save_upload(path=temp_filepath)
    except Exception as e:
        print(f"Error in configure_retriever: {e}")
def init_chain():
    try:
        # Load environment variables and initialize the chat model; the
        # DASHSCOPE_API_KEY is expected to come from .env rather than being
        # hardcoded in source
        load_dotenv(".env")
        llm = ChatTongyi(
            streaming=True,
            model='qwen1.5-72b-chat'
        )
        # llm = ChatZhipuAI(
        #     streaming=True,
        #     api_key=os.getenv("ZHIPUAI_API_KEY"),
        #     model="cogview-3",
        # )
        embedding = HuggingFaceEmbeddings(model_name="/home/zhangxj/models/acge_text_embedding",
                                          model_kwargs={"device": device})
        vectorstore = Chroma(persist_directory="chromadb", embedding_function=embedding)
        retriever = vectorstore.as_retriever()
        # System prompt (in Chinese): "You are an experienced expert in the life
        # cycle field. Use the retrieved context to answer; say so if the context
        # lacks sufficient information, ask the user when something is unclear,
        # and cite the literature in full for life cycle assessment questions."
        instruct_system_prompt = (
            "你是生命周期领域富有经验的专家。"
            "你要利用检索到的上下文来回答问题。如果上下文没有足够的信息,请说明。"
            "如果你有不明白的地方,请向用户询问。"
            "涉及生命周期评价领域的问题,你应该完整地引用文献资料。\n\n"
            "{context}"
        )
        instruct_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", instruct_system_prompt),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )
        qa_chain = create_stuff_documents_chain(llm, instruct_prompt)
        rag_chain = create_retrieval_chain(retriever, qa_chain)
        return rag_chain
    except Exception as e:
        print(f"Error in init_chain: {e}")
        return None

def user_in(uin, rag_chain, history):
    try:
        result = rag_chain.invoke({"input": uin, "chat_history": history})["answer"]
        return result
    except Exception as e:
        print(f"Error in user_in: {e}")
        return "An error occurred while processing your request."