{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdbca093aac051e26e17272d0e199954-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Walk up from the working directory until the repository root (the\n",
    "# directory that contains `.git`) is found.\n",
    "root_path = Path(\".\").absolute()\n",
    "while not (root_path/\".git\").exists():\n",
    "    if root_path.parent == root_path:\n",
    "        # Reached the filesystem root without finding `.git`: fail instead of looping forever.\n",
    "        raise FileNotFoundError(\"no .git directory found in the working directory or any of its parents\")\n",
    "    root_path = root_path.parent\n",
    "\n",
    "# rag_utils lives in <repo>/data/rag, which is not an installed package.\n",
    "sys.path.append(str(root_path/\"data\"/\"rag\"))\n",
    "from rag_utils import logs_path, constraint_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "998bf3c5f3ec65213e0dcf42fc88eb63-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import json\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f89dc0bd1f3e913c2fe639bdc02fd74f-1",
   "metadata": {},
   "source": [
    "## Find all log paths\n",
    "\n",
    "For every numbered problem folder under `logs_path`, select its gpt-4o run folder (the most recently modified one when there are several)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e3bbe0261cd1d62992907e8d71be8d4-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Problem folders are named after their integer problem number. Filter\n",
    "# explicitly: other entries in logs_path (e.g. status.txt) have non-numeric\n",
    "# names that would make the int() sort key raise, and before Python 3.11 a\n",
    "# glob pattern ending in \"/\" also returned plain files.\n",
    "problem_paths = sorted(\n",
    "    (p for p in logs_path.glob(\"*/\") if p.is_dir() and p.name.isdecimal()),\n",
    "    key=lambda p: int(p.name),\n",
    ")\n",
    "\n",
    "real_logs = []\n",
    "skipped = []\n",
    "\n",
    "for problem_path in problem_paths:\n",
    "    gpt4o_log_paths = list(problem_path.glob(\"run*_gpt-4o*\"))\n",
    "    if not gpt4o_log_paths:\n",
    "        skipped.append(problem_path)\n",
    "        continue\n",
    "    # With several gpt-4o runs, keep the most recently modified one\n",
    "    # (with a single run, max() simply returns it).\n",
    "    real_logs.append(max(gpt4o_log_paths, key=lambda p: p.stat().st_mtime))\n",
    "\n",
    "assert len(skipped) == 0, f\"problems without a gpt-4o run: {[p.name for p in skipped]}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f22e22f10ae3fce36d0d1f69fb0eeac-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "missing_files_in_log_folder = []\n",
    "\n",
    "for run_path in real_logs:\n",
    "    # Order the files by the integer formed from all digits in their name\n",
    "    # (the leading \"0\" makes names without digits sort as 0).\n",
    "    names = sorted(run_path.glob(\"*\"), key=lambda x: int(\"0\" + \"\".join(re.findall(\"\\\\d\", x.name))))\n",
    "    if not names:\n",
    "        missing_files_in_log_folder.append(run_path)\n",
    "        continue\n",
    "    # Show the last file of the run according to that ordering.\n",
    "    print(names[-1].name)\n",
    "\n",
    "missing_files_in_log_folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d374627207c37fd7424a9aeccda1c4e2-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_solution_instance_status(file_path: Path):\n",
    "    \"\"\"Return the instance numbers that a status file marks as solved.\n",
    "\n",
    "    The first line of the file is skipped. Every other line is split on\n",
    "    whitespace; lines with fewer than four fields are ignored. The first\n",
    "    field is parsed as the integer instance number and the instance counts\n",
    "    as solved only when the fourth field equals \"Solved\".\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the status file to read.\n",
    "\n",
    "    Returns:\n",
    "        List of solved instance numbers, in file order.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the first field of a line with at least four fields\n",
    "            is not an integer.\n",
    "    \"\"\"\n",
    "    solved = []\n",
    "    for line in file_path.read_text().splitlines()[1:]:\n",
    "        parts = line.split()\n",
    "        if len(parts) < 4:\n",
    "            continue\n",
    "        instance_number = int(parts[0])\n",
    "        if parts[3] == \"Solved\":\n",
    "            solved.append(instance_number)\n",
    "    return solved\n",
    "\n",
    "# Path to the input file\n",
    "file_path = logs_path/'status.txt'\n",
    "\n",
    "# Extract the instance numbers of the solved problems\n",
    "solved_problems = extract_solution_instance_status(file_path)\n",
    "solved_problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d721badf71c5a6669badeab3acf97dd-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set for constant-time membership checks inside the loop.\n",
    "solved_problem_numbers = set(solved_problems)\n",
    "\n",
    "problem_objectives_formulations_and_labels = []\n",
    "for log_path in real_logs:\n",
    "    state_6_path = log_path/\"state_6_code.json\"\n",
    "    problem_dir = log_path.parent\n",
    "    # Keep only runs that produced state_6_code.json and whose problem is marked as solved.\n",
    "    if not state_6_path.exists() or int(problem_dir.name) not in solved_problem_numbers:\n",
    "        continue\n",
    "    labels = json.loads((problem_dir/\"labels.json\").read_text())\n",
    "    description = (problem_dir/\"desc.txt\").read_text()\n",
    "    state_6 = json.loads(state_6_path.read_text())\n",
    "    problem_objectives_formulations_and_labels.append({\n",
    "        \"objective_description\": state_6[\"objective\"][\"description\"],\n",
    "        \"objective_formulation\": state_6[\"objective\"][\"formulation\"],\n",
    "        \"constraints\": [\n",
    "            {\"description\": constraint[\"description\"], \"formulation\": constraint[\"formulation\"]}\n",
    "            for constraint in state_6[\"constraints\"]\n",
    "        ],\n",
    "        \"labels\": labels,\n",
    "        \"description\": description,\n",
    "        \"problem_name\": problem_dir.name\n",
    "    })\n",
    "\n",
    "# Preview the first record; a slice (unlike [0]) does not raise when nothing matched.\n",
    "problem_objectives_formulations_and_labels[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "571b12c459bf920f6c2d04a7b97ffa8f-1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten to one row per constraint, repeating the problem-level fields on every row.\n",
    "constraint_data = [\n",
    "    {\n",
    "        \"objective_description\": problem[\"objective_description\"],\n",
    "        \"objective_formulation\": problem[\"objective_formulation\"],\n",
    "        \"constraint_description\": constraint[\"description\"],\n",
    "        \"constraint_formulation\": constraint[\"formulation\"],\n",
    "        \"labels\": problem[\"labels\"],\n",
    "        \"description\": problem[\"description\"],\n",
    "        \"problem_name\": problem[\"problem_name\"]\n",
    "    }\n",
    "    for problem in problem_objectives_formulations_and_labels\n",
    "    for constraint in problem[\"constraints\"]\n",
    "]\n",
    "total_constraints = len(constraint_data)\n",
    "\n",
    "constraints_df = pd.DataFrame(constraint_data)\n",
    "constraints_df.to_pickle(constraint_path)"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}