{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "813059a4b718b2ab23e367711ce57bc2-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Climb from the working directory until the repository root (the directory containing \".git\") is found.\n",
    "root_path = Path(\".\").absolute()\n",
    "while not (root_path / \".git\").exists():\n",
    "    if root_path.parent == root_path:\n",
    "        # Reached the filesystem root: stop instead of looping forever.\n",
    "        raise FileNotFoundError(\"No '.git' entry found in the working directory or any of its parents.\")\n",
    "    root_path = root_path.parent\n",
    "\n",
    "# Make <repo>/data/rag importable so that rag_utils can be found.\n",
    "sys.path.append(str(root_path / \"data\" / \"rag\"))\n",
    "from rag_utils import constraint_path, problem_descriptions_vector_db_path, constraint_vector_db_path, objective_descriptions_vector_db_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c13e72512dc4927c1270dcf73358886-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "from typing import Dict, List\n",
    "\n",
    "import pandas as pd\n",
    "from langchain_chroma import Chroma\n",
    "from langchain.schema.document import Document\n",
    "from langchain_openai import OpenAIEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a25228df259f2c328e7b5109b329dae7-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: unpickling can execute arbitrary code; only load pickles that come from a trusted source.\n",
    "constraint_df = pd.read_pickle(constraint_path)\n",
    "constraint_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7a6dec35ab195c6fc5b09ac3c6f037c-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "constraint_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07856d19ee765f9bfd6da9672627d811-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_problems_df = constraint_df[[\"description\", \"problem_name\"]].drop_duplicates()\n",
    "unique_problems_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1777acedb60f98c1fb344128d8fe013-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_vector_db(data: Dict[str, str], vector_db_path: Path, model_name: str = \"text-embedding-3-large\") -> None:\n",
    "    \"\"\"\n",
    "    Creates a vector database from a dictionary of strings.\n",
    "\n",
    "    Each value becomes one document whose key is stored in the document metadata\n",
    "    under \"key\". Whatever already exists at `vector_db_path` is removed first, so\n",
    "    the database is rebuilt from scratch on every call.\n",
    "\n",
    "    Args:\n",
    "        data (Dict[str, str]): A dictionary where keys are identifiers and values are strings.\n",
    "        vector_db_path (Path): The directory in which the vector database is persisted.\n",
    "        model_name (str): The model name for generating embeddings.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If `data` is empty.\n",
    "    \"\"\"\n",
    "    if not data:\n",
    "        # Validate before touching the disk so an existing database is not wiped for nothing.\n",
    "        raise ValueError(\"`data` is empty; there is nothing to index.\")\n",
    "\n",
    "    embedding_function = OpenAIEmbeddings(model=model_name)\n",
    "    docs = [Document(page_content=value, metadata={\"key\": key}) for key, value in data.items()]\n",
    "\n",
    "    # Path.unlink() cannot remove a directory, so a previously persisted database\n",
    "    # needs rmtree; plain files and symlinks are still removed with unlink().\n",
    "    if vector_db_path.is_dir() and not vector_db_path.is_symlink():\n",
    "        shutil.rmtree(vector_db_path)\n",
    "    elif vector_db_path.exists() or vector_db_path.is_symlink():\n",
    "        vector_db_path.unlink()\n",
    "    vector_db_path.mkdir(exist_ok=True, parents=True)\n",
    "\n",
    "    Chroma.from_documents(docs, embedding_function, persist_directory=str(vector_db_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "563afd8fbdc7c39ff026870f2a42f089-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_problems_dict = unique_problems_df.set_index('problem_name')['description'].to_dict()\n",
    "make_vector_db(unique_problems_dict, problem_descriptions_vector_db_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4ae995bf63155e7b78db737be2c68f5-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "constraint_dict = constraint_df.constraint_description.to_dict()\n",
    "constraint_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5301b48fab7028dc02c356b8c9feb0e3-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_vector_db(constraint_dict, constraint_vector_db_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e24d96c962c61e100e1fdc0e68f87808-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "objectives_dict = constraint_df[['objective_description', 'problem_name']].drop_duplicates().set_index('problem_name')['objective_description'].to_dict()\n",
    "objectives_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ac6d38c20a87407ce015c655157fb72-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_vector_db(objectives_dict, objective_descriptions_vector_db_path)"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}