149 lines
4.2 KiB
Plaintext
149 lines
4.2 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "813059a4b718b2ab23e367711ce57bc2-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import sys\n",
|
||
|
"from pathlib import Path\n",
|
||
|
"# Locate the repository root: the nearest directory, starting from the current one, that contains \".git\".\n",
"start_path = Path(\".\").absolute()\n",
"root_path = next((candidate for candidate in (start_path, *start_path.parents) if (candidate/\".git\").exists()), None)\n",
"if root_path is None:\n",
"    # Fail fast instead of searching forever when the notebook is run outside a git checkout.\n",
"    raise FileNotFoundError(f\"No .git directory found in {start_path} or any of its parents\")\n",
"rag_dir = str(root_path/\"data\"/\"rag\")\n",
"if rag_dir not in sys.path:  # keep sys.path free of duplicates when the cell is re-run\n",
"    sys.path.append(rag_dir)\n",
|
||
|
"from rag_utils import constraint_path, problem_descriptions_vector_db_path, constraint_vector_db_path, objective_descriptions_vector_db_path"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "3c13e72512dc4927c1270dcf73358886-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"from typing import Dict, List\n",
|
||
|
"from langchain_chroma import Chroma\n",
|
||
|
"from langchain.schema.document import Document\n",
|
||
|
"from langchain_openai import OpenAIEmbeddings"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "a25228df259f2c328e7b5109b329dae7-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load the pickled constraints table; the path is provided by rag_utils.\n",
"constraint_df = pd.read_pickle(filepath_or_buffer=constraint_path)\n",
"constraint_df"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "a7a6dec35ab195c6fc5b09ac3c6f037c-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Inspect the column names available in the constraints table.\n",
"constraint_df.columns"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "07856d19ee765f9bfd6da9672627d811-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Keep one row per distinct (description, problem_name) combination.\n",
"unique_problems_df = (\n",
"    constraint_df[[\"description\", \"problem_name\"]]\n",
"    .drop_duplicates()\n",
")\n",
"unique_problems_df"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "d1777acedb60f98c1fb344128d8fe013-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def make_vector_db(data: Dict[str, str], vector_db_path: Path, model_name: str = \"text-embedding-3-large\"):\n",
"    \"\"\"\n",
"    Creates a vector database from a dictionary of strings.\n",
"\n",
"    Anything already present at `vector_db_path` is removed before the new\n",
"    database is written, so re-running the function rebuilds it from scratch.\n",
"\n",
"    Args:\n",
"        data (Dict[str, str]): A dictionary where keys are identifiers and values are strings.\n",
"            Each value becomes the content of one document; its key is stored in the\n",
"            document metadata under 'key'.\n",
"        vector_db_path (Path): The directory in which the vector database is persisted.\n",
"        model_name (str): The OpenAI embedding model used to generate embeddings.\n",
"    \"\"\"\n",
"    import shutil\n",
"\n",
"    embedding_function = OpenAIEmbeddings(model=model_name)\n",
"    docs = [Document(page_content=value, metadata={\"key\": key}) for key, value in data.items()]\n",
"    # The persist location is a directory, which Path.unlink() cannot remove;\n",
"    # only plain files and symlinks are unlinked.\n",
"    if vector_db_path.is_dir() and not vector_db_path.is_symlink():\n",
"        shutil.rmtree(vector_db_path)\n",
"    elif vector_db_path.exists() or vector_db_path.is_symlink():\n",
"        vector_db_path.unlink()\n",
"    vector_db_path.mkdir(exist_ok=True, parents=True)\n",
"    Chroma.from_documents(docs, embedding_function, persist_directory=str(vector_db_path))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "563afd8fbdc7c39ff026870f2a42f089-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Map each problem name to its description, then embed the descriptions into their own vector database.\n",
"unique_problems_dict = unique_problems_df.set_index('problem_name')['description'].to_dict()\n",
"make_vector_db(unique_problems_dict, problem_descriptions_vector_db_path)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "d4ae995bf63155e7b78db737be2c68f5-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Series.to_dict() keys each constraint description by its index label in constraint_df.\n",
"constraint_dict = constraint_df[\"constraint_description\"].to_dict()\n",
"constraint_dict"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "5301b48fab7028dc02c356b8c9feb0e3-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Embed the constraint descriptions into the constraints vector database.\n",
"make_vector_db(data=constraint_dict, vector_db_path=constraint_vector_db_path)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "e24d96c962c61e100e1fdc0e68f87808-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Map each problem name to its objective description (distinct pairs only).\n",
"objectives_dict = (\n",
"    constraint_df[['objective_description', 'problem_name']]\n",
"    .drop_duplicates()\n",
"    .set_index('problem_name')['objective_description']\n",
"    .to_dict()\n",
")\n",
"objectives_dict"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "8ac6d38c20a87407ce015c655157fb72-1",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Embed the objective descriptions into the objectives vector database.\n",
"make_vector_db(data=objectives_dict, vector_db_path=objective_descriptions_vector_db_path)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|