building-agents/llma/rag/constraints_to_vector_db.ipynb

149 lines
4.2 KiB
Plaintext
Raw Permalink Normal View History

2024-11-22 10:03:31 +08:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "813059a4b718b2ab23e367711ce57bc2-1",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"\n",
"# Walk upwards from the working directory until a folder containing .git is found.\n",
"root_path = Path(\".\")\n",
"while not (root_path/\".git\").exists():\n",
"    parent_path = root_path.absolute().parent\n",
"    # At the filesystem root the parent is the path itself; stop instead of looping forever.\n",
"    if parent_path == root_path:\n",
"        raise FileNotFoundError(\"No .git directory found in the working directory or any of its parents\")\n",
"    root_path = parent_path\n",
"# Put <root>/data/rag on sys.path before the rag_utils import below.\n",
"sys.path.append(str(root_path/\"data\"/\"rag\"))\n",
"from rag_utils import constraint_path, problem_descriptions_vector_db_path, constraint_vector_db_path, objective_descriptions_vector_db_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c13e72512dc4927c1270dcf73358886-1",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from typing import Dict, List\n",
"from langchain_chroma import Chroma\n",
"from langchain.schema.document import Document\n",
"from langchain_openai import OpenAIEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a25228df259f2c328e7b5109b329dae7-1",
"metadata": {},
"outputs": [],
"source": [
"# Load the pickled constraints table from constraint_path (imported from rag_utils).\n",
"# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.\n",
"constraint_df = pd.read_pickle(constraint_path)\n",
"constraint_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7a6dec35ab195c6fc5b09ac3c6f037c-1",
"metadata": {},
"outputs": [],
"source": [
"# Show the column labels of the constraints table.\n",
"constraint_df.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07856d19ee765f9bfd6da9672627d811-1",
"metadata": {},
"outputs": [],
"source": [
"# Keep one row per distinct (description, problem_name) pair.\n",
"unique_problems_df = constraint_df.loc[:, [\"description\", \"problem_name\"]].drop_duplicates()\n",
"unique_problems_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1777acedb60f98c1fb344128d8fe013-1",
"metadata": {},
"outputs": [],
"source": [
"def make_vector_db(data: Dict[str, str], vector_db_path: Path, model_name: str = \"text-embedding-3-large\"):\n",
"    \"\"\"\n",
"    Creates a Chroma vector database from a dictionary of strings.\n",
"\n",
"    Every value is embedded as one document and its key is stored in that\n",
"    document's metadata under \"key\". Whatever already exists at\n",
"    vector_db_path is removed first, so the database is rebuilt from scratch.\n",
"\n",
"    Args:\n",
"        data (Dict[str, str]): A dictionary where keys are identifiers and values are strings.\n",
"        vector_db_path (Path): The directory in which the vector database is persisted.\n",
"        model_name (str): The OpenAI embedding model used to generate embeddings.\n",
"    \"\"\"\n",
"    import shutil  # stdlib; imported here because the notebook's import cell does not provide it\n",
"\n",
"    embedding_function = OpenAIEmbeddings(model=model_name)\n",
"    docs = [Document(page_content=value, metadata={\"key\": key}) for key, value in data.items()]\n",
"    # The persist location is a directory, and Path.unlink() cannot remove directories,\n",
"    # so a previous database is deleted with rmtree; plain files and symlinks are unlinked.\n",
"    if vector_db_path.is_symlink() or vector_db_path.is_file():\n",
"        vector_db_path.unlink()\n",
"    elif vector_db_path.is_dir():\n",
"        shutil.rmtree(vector_db_path)\n",
"    vector_db_path.mkdir(exist_ok=True, parents=True)\n",
"    Chroma.from_documents(docs, embedding_function, persist_directory=str(vector_db_path))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "563afd8fbdc7c39ff026870f2a42f089-1",
"metadata": {},
"outputs": [],
"source": [
"# Map each problem name to its description, then build the problem-description vector database.\n",
"# NOTE(review): if a problem_name appears with several descriptions, to_dict() keeps only the last one; confirm names are unique.\n",
"unique_problems_dict = unique_problems_df.set_index('problem_name')['description'].to_dict()\n",
"\n",
"make_vector_db(unique_problems_dict, problem_descriptions_vector_db_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4ae995bf63155e7b78db737be2c68f5-1",
"metadata": {},
"outputs": [],
"source": [
"# Map every index label of the constraints table to its constraint description.\n",
"constraint_dict = constraint_df[\"constraint_description\"].to_dict()\n",
"constraint_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5301b48fab7028dc02c356b8c9feb0e3-1",
"metadata": {},
"outputs": [],
"source": [
"# Build the vector database for the constraint descriptions at constraint_vector_db_path.\n",
"make_vector_db(data=constraint_dict, vector_db_path=constraint_vector_db_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e24d96c962c61e100e1fdc0e68f87808-1",
"metadata": {},
"outputs": [],
"source": [
"# Map each problem name to its objective description.\n",
"objectives_dict = (\n",
"    constraint_df[['objective_description', 'problem_name']]\n",
"    .drop_duplicates()\n",
"    .set_index('problem_name')['objective_description']\n",
"    .to_dict()\n",
")\n",
"objectives_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ac6d38c20a87407ce015c655157fb72-1",
"metadata": {},
"outputs": [],
"source": [
"# Build the vector database for the objective descriptions at objective_descriptions_vector_db_path.\n",
"make_vector_db(data=objectives_dict, vector_db_path=objective_descriptions_vector_db_path)"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}