LCA-LLM/LCA_RAG/chroma.py

'''
Build the Chroma vector database from a directory of PDF documents.
'''
import os
import sys

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Work relative to the script's own directory
os.chdir(sys.path[0])

# Load the environment variables and API keys
load_dotenv(".env")
key = 1  # placeholder key; not used when only building the vector store

# Parser for extracting the plain-text output of an LLM (unused in this script)
parser = StrOutputParser()
# Build and persist the vector store for one document
def chroma_save(path, key=key):
    # Load the PDF
    loader = PyPDFLoader(path)
    doc = loader.load()
    # Split the document content into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(doc)
    # Embed the chunks and persist them locally; from_documents already adds
    # the splits, so a second add_documents call would only store duplicates
    embedding = HuggingFaceEmbeddings(model_name="/home/zhangxj/models/acge_text_embedding")
    Chroma.from_documents(documents=splits, embedding=embedding, persist_directory="chroma_new")
    print("saved: " + str(path))
def main():
    data = "/home/zhangxj/WorkFile/LCA-GPT/LCAdata"  # os.path.join('resources', 'pdfs')
    docs = os.listdir(data)
    # Save every file to the local database
    for doc in docs:
        doc_path = os.path.join(data, doc)
        chroma_save(doc_path)
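
# A minimal retrieval sketch for the store persisted above, assuming the same
# embedding model and the "chroma_new" directory; the function name, query
# string, and k value are illustrative and not taken from the original project.
def query_example(question="life cycle assessment of steel production", k=4):
    embedding = HuggingFaceEmbeddings(model_name="/home/zhangxj/models/acge_text_embedding")
    vs = Chroma(persist_directory="chroma_new", embedding_function=embedding)
    # Return the k chunks most similar to the question
    retriever = vs.as_retriever(search_kwargs={"k": k})
    return retriever.invoke(question)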

if __name__ == "__main__":
    main()