building-agents/llma/rag/extract_constraints_from_lo...

182 lines
6.0 KiB
Plaintext
Raw Permalink Normal View History

2024-11-22 10:03:31 +08:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "cdbca093aac051e26e17272d0e199954-1",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"# Locate the repository root: walk upwards from the working directory until\n",
"# a directory containing `.git` is found.\n",
"root_path = Path(\".\").absolute()\n",
"while not (root_path/\".git\").exists():\n",
"    if root_path.parent == root_path:\n",
"        # The filesystem root is its own parent; without this guard the loop\n",
"        # would spin forever when the notebook runs outside a git checkout.\n",
"        raise FileNotFoundError(\"no .git directory found in the working directory or any parent\")\n",
"    root_path = root_path.parent\n",
"# Make data/rag importable so the rag_utils import below resolves; skip the\n",
"# append when the entry is already present so re-running the cell is idempotent.\n",
"rag_module_dir = str(root_path/\"data\"/\"rag\")\n",
"if rag_module_dir not in sys.path:\n",
"    sys.path.append(rag_module_dir)\n",
"from rag_utils import logs_path, constraint_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "998bf3c5f3ec65213e0dcf42fc88eb63-1",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import json\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "f89dc0bd1f3e913c2fe639bdc02fd74f-1",
"metadata": {},
"source": [
"Find the gpt-4o run log folder for each problem (the most recently modified one when a problem has several)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e3bbe0261cd1d62992907e8d71be8d4-1",
"metadata": {},
"outputs": [],
"source": [
"# Problem folders are the numerically named sub-directories of logs_path.\n",
"# Filter explicitly so plain files (status.txt also lives in logs_path) and\n",
"# non-numeric folders never reach the int() sort key, where they would raise\n",
"# ValueError; before Python 3.11 a trailing \"/\" in a glob pattern did not\n",
"# restrict the matches to directories.\n",
"problem_paths = sorted(\n",
"    (p for p in logs_path.glob(\"*/\") if p.is_dir() and p.name.isdecimal()),\n",
"    key=lambda x: int(x.name),\n",
")\n",
"\n",
"real_logs = []\n",
"skipped = []\n",
"\n",
"for problem_path in problem_paths:\n",
"    gpt4o_log_path = list(problem_path.glob(\"run*_gpt-4o*\"))\n",
"    if not gpt4o_log_path:\n",
"        # No gpt-4o run was recorded for this problem.\n",
"        skipped.append(problem_path)\n",
"        continue\n",
"    # With several runs keep the most recently modified one; max() also\n",
"    # covers the single-run case.\n",
"    real_logs.append(max(gpt4o_log_path, key=lambda p: p.stat().st_mtime))\n",
"\n",
"# Every problem folder is expected to contain at least one gpt-4o run.\n",
"assert len(skipped) == 0, f\"problems without a gpt-4o run: {[p.name for p in skipped]}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f22e22f10ae3fce36d0d1f69fb0eeac-1",
"metadata": {},
"outputs": [],
"source": [
"# Collects every selected log folder that contains no entries.\n",
"missing_files_in_log_folder = []\n",
"\n",
"for log_dir in real_logs:\n",
"    # Order entries by the integer formed from all digits in their name; the\n",
"    # \"0\" prefix gives digit-free names a key of 0 instead of failing on int(\"\").\n",
"    names = list(log_dir.glob(\"*\"))\n",
"    names.sort(key=lambda entry: int(\"0\"+\"\".join(re.findall(\"\\\\d\", entry.name))))\n",
"    if not names:\n",
"        missing_files_in_log_folder.append(log_dir)\n",
"        continue\n",
"    # Show the entry with the largest numeric key for this folder.\n",
"    print(names[-1].name)\n",
"\n",
"missing_files_in_log_folder"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d374627207c37fd7424a9aeccda1c4e2-1",
"metadata": {},
"outputs": [],
"source": [
"def extract_solution_instance_status(file_path: Path):\n",
"    \"\"\"Return the instance numbers marked as solved in a status file.\n",
"\n",
"    The first line of the file is skipped. Every later line is split on\n",
"    whitespace and ignored when it has fewer than four fields. The first\n",
"    field is parsed as the integer instance number, and the row counts as\n",
"    solved only when the fourth field is exactly \"Solved\".\n",
"\n",
"    Args:\n",
"        file_path: Path of the status text file to read.\n",
"\n",
"    Returns:\n",
"        The instance numbers (int) of the solved rows, in file order.\n",
"\n",
"    Raises:\n",
"        ValueError: If a row with at least four fields has a first field\n",
"            that is not an integer.\n",
"    \"\"\"\n",
"    solved_instances = []\n",
"    # [1:] drops the first line, which is never parsed.\n",
"    for row in file_path.read_text().splitlines()[1:]:\n",
"        fields = row.split()\n",
"        if len(fields) < 4:\n",
"            continue\n",
"        # Parse before checking the status so malformed ids still raise.\n",
"        instance_number = int(fields[0])\n",
"        if fields[3] == \"Solved\":\n",
"            solved_instances.append(instance_number)\n",
"    return solved_instances\n",
"\n",
"# Status summary file located directly inside the logs directory.\n",
"file_path = logs_path / 'status.txt'\n",
"\n",
"# Instance numbers of the rows whose status field reads \"Solved\".\n",
"solved_problems = extract_solution_instance_status(file_path)\n",
"solved_problems"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d721badf71c5a6669badeab3acf97dd-1",
"metadata": {},
"outputs": [],
"source": [
"# Set copy of the solved instance numbers for O(1) membership tests below.\n",
"solved_problem_ids = set(solved_problems)\n",
"\n",
"problem_objectives_formulations_and_labels = []\n",
"for log_path in real_logs:\n",
"    state_6_path = log_path/\"state_6_code.json\"\n",
"    problem_dir = log_path.parent\n",
"    # Keep only runs that produced state_6_code.json and whose problem number\n",
"    # (the parent folder name) is among the solved instances.\n",
"    if not state_6_path.exists() or int(problem_dir.name) not in solved_problem_ids:\n",
"        continue\n",
"    labels = json.loads((problem_dir/\"labels.json\").read_text())\n",
"    description = (problem_dir/\"desc.txt\").read_text()\n",
"    data = json.loads(state_6_path.read_text())\n",
"    # Retain only the description and formulation of each constraint.\n",
"    constraints = [\n",
"        {\"description\": constraint[\"description\"], \"formulation\": constraint[\"formulation\"]}\n",
"        for constraint in data[\"constraints\"]\n",
"    ]\n",
"    problem_objectives_formulations_and_labels.append({\n",
"        \"objective_description\": data[\"objective\"][\"description\"],\n",
"        \"objective_formulation\": data[\"objective\"][\"formulation\"],\n",
"        \"constraints\": constraints,\n",
"        \"labels\": labels,\n",
"        \"description\": description,\n",
"        \"problem_name\": problem_dir.name\n",
"    })\n",
"\n",
"# Fail with a clear message instead of a bare IndexError on the preview below.\n",
"assert problem_objectives_formulations_and_labels, \"no solved problem with a state_6_code.json was found\"\n",
"problem_objectives_formulations_and_labels[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "571b12c459bf920f6c2d04a7b97ffa8f-1",
"metadata": {},
"outputs": [],
"source": [
"# Flatten to one record per (problem, constraint) pair, repeating the\n",
"# problem-level fields on every record.\n",
"constraint_data = [\n",
"    {\n",
"        \"objective_description\": problem[\"objective_description\"],\n",
"        \"objective_formulation\": problem[\"objective_formulation\"],\n",
"        \"constraint_description\": constraint[\"description\"],\n",
"        \"constraint_formulation\": constraint[\"formulation\"],\n",
"        \"labels\": problem[\"labels\"],\n",
"        \"description\": problem[\"description\"],\n",
"        \"problem_name\": problem[\"problem_name\"]\n",
"    }\n",
"    for problem in problem_objectives_formulations_and_labels\n",
"    for constraint in problem[\"constraints\"]\n",
"]\n",
"# Number of constraints summed over all problems.\n",
"total_constraints = sum(\n",
"    len(problem[\"constraints\"]) for problem in problem_objectives_formulations_and_labels\n",
")\n",
"\n",
"# Build the table in one step and persist it as a pickle at constraint_path.\n",
"constraints_df = pd.DataFrame(constraint_data)\n",
"constraints_df.to_pickle(constraint_path)"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}