{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdbca093aac051e26e17272d0e199954-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "root_path = Path(\".\")\n",
    "while not (root_path/\".git\").exists():\n",
    "    root_path = root_path.absolute().parent\n",
    "sys.path.append(str(root_path/\"data\"/\"rag\"))\n",
    "from rag_utils import logs_path, constraint_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "998bf3c5f3ec65213e0dcf42fc88eb63-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import json\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f89dc0bd1f3e913c2fe639bdc02fd74f-1",
   "metadata": {},
   "source": [
    "Find all log paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e3bbe0261cd1d62992907e8d71be8d4-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "problem_paths = sorted(logs_path.glob(\"*/\"), key=lambda x: int(x.name))\n",
    "problem_paths[:5]\n",
    "\n",
    "real_logs = []\n",
    "skipped = []\n",
    "\n",
    "for problem_path in problem_paths:\n",
    "    gpt4o_log_path = list(problem_path.glob(\"run*_gpt-4o*\"))\n",
    "    \n",
    "    if len(gpt4o_log_path) == 0:\n",
    "        skipped.append(problem_path)\n",
    "    elif len(gpt4o_log_path) > 1:\n",
    "        newest_log_path = max(gpt4o_log_path, key=lambda p: p.stat().st_mtime)\n",
    "        real_logs.append(newest_log_path)\n",
    "    else:\n",
    "        real_logs.append(gpt4o_log_path[0])\n",
    "\n",
    "assert len(skipped) == 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f22e22f10ae3fce36d0d1f69fb0eeac-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "missing_files_in_log_folder = []\n",
    "\n",
    "for idx in range(len(real_logs)):\n",
    "    #  idx = random.randint(0, len(real_logs))\n",
    "    names = sorted(real_logs[idx].glob(\"*\"), key=lambda x: int(\"0\"+\"\".join(re.findall(\"\\\\d\", x.name))))\n",
    "    if len(names) == 0:\n",
    "        missing_files_in_log_folder.append(real_logs[idx])\n",
    "        continue\n",
    "    print([x.name for x in names][-1])\n",
    "    #  print()\n",
    "\n",
    "missing_files_in_log_folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d374627207c37fd7424a9aeccda1c4e2-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_solution_instance_status(file_path: Path):\n",
    "    lines = file_path.read_text().splitlines()\n",
    "    \n",
    "    extracted_data = []\n",
    "\n",
    "    for i, line in enumerate(lines):\n",
    "        if i == 0:\n",
    "            continue\n",
    "        parts = line.split()\n",
    "        if len(parts) >= 4:\n",
    "            instance_number = int(parts[0])\n",
    "            status = parts[3] == \"Solved\"\n",
    "            extracted_data.append((instance_number, status))\n",
    "\n",
    "    return [x[0] for x in extracted_data if x[1]]\n",
    "\n",
    "# Path to the input file\n",
    "file_path = logs_path/'status.txt'\n",
    "file_path\n",
    "\n",
    "# Extract the solution instance number and Status\n",
    "solved_problems = extract_solution_instance_status(file_path)\n",
    "solved_problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d721badf71c5a6669badeab3acf97dd-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "problem_objectives_formulations_and_labels = []\n",
    "for log_path in real_logs:\n",
    "    if (state_6_path := (log_path/\"state_6_code.json\")).exists():\n",
    "        if int(log_path.parent.name) not in solved_problems:\n",
    "            continue\n",
    "        labels = json.loads((log_path.parent/\"labels.json\").read_text())\n",
    "        description = (log_path.parent/\"desc.txt\").read_text()\n",
    "        data = json.loads(state_6_path.read_text())\n",
    "        objective_description = data[\"objective\"][\"description\"]\n",
    "        objective_formulation = data[\"objective\"][\"formulation\"]\n",
    "        constraints = []\n",
    "        for constraint in data[\"constraints\"]:\n",
    "            constraints.append({\"description\": constraint[\"description\"], \"formulation\": constraint[\"formulation\"]})\n",
    "        problem_objectives_formulations_and_labels.append({\n",
    "            \"objective_description\": objective_description,\n",
    "            \"objective_formulation\": objective_formulation,\n",
    "            \"constraints\": constraints,\n",
    "            \"labels\": labels,\n",
    "            \"description\": description,\n",
    "            \"problem_name\": log_path.parent.name\n",
    "        })\n",
    "\n",
    "problem_objectives_formulations_and_labels[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "571b12c459bf920f6c2d04a7b97ffa8f-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "constraint_data = []\n",
    "total_constraints = 0\n",
    "for data in problem_objectives_formulations_and_labels:\n",
    "    total_constraints += len(data[\"constraints\"])\n",
    "    for data_constraint in data[\"constraints\"]:\n",
    "        constraint_data.append({\n",
    "            \"objective_description\": data[\"objective_description\"],\n",
    "            \"objective_formulation\": data[\"objective_formulation\"],\n",
    "            \"constraint_description\": data_constraint[\"description\"],\n",
    "            \"constraint_formulation\": data_constraint[\"formulation\"],\n",
    "            \"labels\": data[\"labels\"],\n",
    "            \"description\": data[\"description\"],\n",
    "            \"problem_name\": data[\"problem_name\"]\n",
    "        })\n",
    "\n",
    "constraints_df = pd.DataFrame(constraint_data)\n",
    "constraints_df.to_pickle(constraint_path)"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}