building-agents/llma/rag/extract_constraints_from_lo...

In [ ]:
import sys
from pathlib import Path

# Walk up from the current directory until the repository root
# (the folder containing .git) is found, then make data/rag importable.
root_path = Path(".").absolute()
while not (root_path/".git").exists():
    root_path = root_path.parent
sys.path.append(str(root_path/"data"/"rag"))
from rag_utils import logs_path, constraint_path
In [ ]:
import re
import json
import pandas as pd

Find all log paths

In [ ]:
# Problem folders are named by integer id; sort them numerically.
problem_paths = sorted(logs_path.glob("*/"), key=lambda x: int(x.name))
problem_paths[:5]

real_logs = []
skipped = []

for problem_path in problem_paths:
    # Each problem may hold several run folders; keep only the gpt-4o runs.
    gpt4o_log_paths = list(problem_path.glob("run*_gpt-4o*"))

    if len(gpt4o_log_paths) == 0:
        skipped.append(problem_path)
    elif len(gpt4o_log_paths) > 1:
        # Several gpt-4o runs: keep the most recently modified one.
        newest_log_path = max(gpt4o_log_paths, key=lambda p: p.stat().st_mtime)
        real_logs.append(newest_log_path)
    else:
        real_logs.append(gpt4o_log_paths[0])

# Every problem folder is expected to contain at least one gpt-4o run.
assert len(skipped) == 0
In [ ]:
missing_files_in_log_folder = []

for log_dir in real_logs:
    # Sort files by the integer embedded in their name and show the latest one.
    names = sorted(log_dir.glob("*"), key=lambda x: int("0" + "".join(re.findall(r"\d", x.name))))
    if len(names) == 0:
        missing_files_in_log_folder.append(log_dir)
        continue
    print(names[-1].name)

missing_files_in_log_folder
In [ ]:
def extract_solution_instance_status(file_path: Path):
    """Parse the status file and return the instance numbers marked as Solved."""
    lines = file_path.read_text().splitlines()

    extracted_data = []

    for i, line in enumerate(lines):
        if i == 0:
            # Skip the header row.
            continue
        parts = line.split()
        if len(parts) >= 4:
            instance_number = int(parts[0])
            status = parts[3] == "Solved"
            extracted_data.append((instance_number, status))

    return [x[0] for x in extracted_data if x[1]]

# Path to the input file
file_path = logs_path/'status.txt'
file_path

# Extract the instance numbers whose status is "Solved"
solved_problems = extract_solution_instance_status(file_path)
solved_problems
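
The parser above assumes a whitespace-delimited status file with a header row, the instance number in the first column, and the status word in the fourth. A minimal sketch with hypothetical column names and values:

In [ ]:
# Hypothetical status.txt layout; only the first and fourth columns matter to the parser.
from tempfile import NamedTemporaryFile

sample = """Instance Objective Time Status
1 42.0 0.3 Solved
2 - 1.2 Infeasible
3 17.5 0.8 Solved
"""

with NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write(sample)

extract_solution_instance_status(Path(f.name))  # expected: [1, 3]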
In [ ]:
problem_objectives_formulations_and_labels = []
for log_path in real_logs:
    # Keep only runs that produced a final state_6_code.json and whose
    # problem id appears in the solved list.
    if (state_6_path := (log_path/"state_6_code.json")).exists():
        if int(log_path.parent.name) not in solved_problems:
            continue
        labels = json.loads((log_path.parent/"labels.json").read_text())
        description = (log_path.parent/"desc.txt").read_text()
        data = json.loads(state_6_path.read_text())
        objective_description = data["objective"]["description"]
        objective_formulation = data["objective"]["formulation"]
        constraints = []
        for constraint in data["constraints"]:
            constraints.append({"description": constraint["description"], "formulation": constraint["formulation"]})
        problem_objectives_formulations_and_labels.append({
            "objective_description": objective_description,
            "objective_formulation": objective_formulation,
            "constraints": constraints,
            "labels": labels,
            "description": description,
            "problem_name": log_path.parent.name
        })

problem_objectives_formulations_and_labels[0]
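
Optionally, a quick count of how many solved problems made it into the extracted set:

In [ ]:
len(problem_objectives_formulations_and_labels)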
In [ ]:
# Flatten the data so that each constraint gets its own row,
# duplicating the problem-level fields alongside it.
constraint_data = []
total_constraints = 0
for data in problem_objectives_formulations_and_labels:
    total_constraints += len(data["constraints"])
    for data_constraint in data["constraints"]:
        constraint_data.append({
            "objective_description": data["objective_description"],
            "objective_formulation": data["objective_formulation"],
            "constraint_description": data_constraint["description"],
            "constraint_formulation": data_constraint["formulation"],
            "labels": data["labels"],
            "description": data["description"],
            "problem_name": data["problem_name"]
        })

constraints_df = pd.DataFrame(constraint_data)
constraints_df.to_pickle(constraint_path)
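
As an optional sanity check (a sketch assuming the pickle was just written above), the file can be read back and its row count compared against the running constraint total:

In [ ]:
# Optional check: the pickled DataFrame should hold one row per extracted constraint.
reloaded_df = pd.read_pickle(constraint_path)
assert len(reloaded_df) == total_constraints
reloaded_df.head()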