math-olympiad-solver

Runtime error

File size: 21,789 Bytes

3b33e85

import gradio as gr
from huggingface_hub import login

import re

# from vllm import LLM, SamplingParams
import pandas as pd
from collections import Counter
from datasets import load_dataset, Dataset, concatenate_datasets
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import os
from typing import Dict, Any, List

# code execution
import os
import re
import signal
import subprocess
import tempfile
from contextlib import contextmanager
from typing import Tuple
from transformers import PreTrainedTokenizer, set_seed
import torch
from tqdm import tqdm
import time
from sympy import N, simplify
from sympy.parsing.latex import parse_latex
import random
from pathlib import Path
from openai import OpenAI

# OpenAI-compatible client used for all generation requests.
# The base URL points at a hosted endpoint (host name suggests a Hugging Face
# Inference Endpoint); the API key is read from the HF_TOKEN environment
# variable and is None when that variable is not set.
client = OpenAI(
    base_url="https://ji0rhe7rvh6wrfmq.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    api_key=os.environ.get("HF_TOKEN"),
)


@dataclass
class Config:
    """Settings for a solver run.

    The first twelve fields have no default and must be supplied by the
    caller; the remaining fields are optional.
    """

    # Which model / revision to use
    model_id: str
    revision: str

    # Template applied to each problem (optional system prompt)
    system_prompt: str

    # How many samples to draw per problem, and how many generation rounds
    num_samples: int
    num_generations: int

    # Sampling / decoding parameters
    do_sample: bool
    temperature: float
    top_p: float
    top_k: int
    max_new_tokens: int
    restart_on_fail: bool

    # Whether 4-bit quantization is enabled
    is_quantized: bool

    # Submission mode is on when KAGGLE_IS_COMPETITION_RERUN is set to a
    # non-empty value at class-definition time
    is_submission: bool = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
    validation_set: str = "kaggle-validation-set-medium"

    # 9 hours minus a 15 minute buffer, expressed in seconds
    notebook_time_limit: int = (9 * 60 - 15) * 60

    # Debug mode: solve only the first problem
    debug: bool = False

    # Whether solutions are pushed to the Hub
    push_to_hub: bool = False


class PythonREPL:
    """Run Python snippets in a separate ``python3`` process with a time limit.

    Args:
        timeout: Seconds allowed for one snippet. The same value is used for
            the subprocess itself and for waiting on the worker thread.
    """

    def __init__(self, timeout=5):
        self.timeout = timeout

    def execute(self, query: str) -> Tuple[bool, str]:
        """Write ``query`` to a temporary file, run it, and report the outcome.

        ``math``, ``numpy`` (as ``np``) and ``sympy`` (as ``sp``) imports are
        prepended. If the last line contains no ``print(`` call it is wrapped
        in ``print(...)`` (after dropping anything following a ``#``) so the
        value of a trailing expression is shown.

        Returns:
            ``(True, stdout)`` when the process exits with code 0, otherwise
            ``(False, message)`` where ``message`` is a condensed traceback or
            a timeout notice.
        """
        query = "import math\nimport numpy as np\nimport sympy as sp\n" + query
        query = query.strip().split("\n")
        if "print(" not in query[-1]:
            if "#" in query[-1]:
                query[-1] = query[-1].split("#")[0]
            query[-1] = "print(" + query[-1] + ")"
        query = "\n".join(query)

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_file_path = os.path.join(temp_dir, "tmp.py")

            # Python reads source files as UTF-8 by default, so write it that
            # way regardless of the locale's preferred encoding.
            with open(temp_file_path, "w", encoding="utf-8") as f:
                f.write(query)

            try:
                result = subprocess.run(
                    ["python3", temp_file_path],
                    capture_output=True,
                    check=False,
                    text=True,
                    timeout=self.timeout,
                )
            except subprocess.TimeoutExpired:
                # subprocess.run raises its own exception type on timeout;
                # report it the same way __call__ reports a thread timeout
                # instead of letting it escape to callers.
                return False, f"Timed out after {self.timeout} seconds."

            if result.returncode == 0:
                output = result.stdout
                return True, output.strip()
            else:
                # Condense the traceback: keep the "Traceback" header, lines
                # that mention the temp file (plus the line after each), and
                # the final line (normally the exception message).
                error_msg = result.stderr.strip()
                msgs = error_msg.split("\n")
                new_msgs = []
                want_next = False
                for m in msgs:
                    if "Traceback" in m:
                        new_msgs.append(m)
                    elif m == msgs[-1]:
                        new_msgs.append(m)
                    elif temp_file_path in m:
                        # NOTE(review): `ed` is always truthy in this branch,
                        # so `clr` is everything from `st` onwards and the
                        # whole tail of the line is removed. Kept unchanged;
                        # confirm whether only the temp path was meant to go.
                        st = m.index('"/') + 1 if '"/' in m else 0
                        ed = m.index(temp_file_path) + 1 if temp_file_path in m else None
                        clr = m[st:ed] if not ed else m[st:]
                        m = m.replace(clr, "")
                        new_msgs.append(m)
                        want_next = True
                    elif want_next:
                        new_msgs.append(m)
                        want_next = False
                error_msg = "\n".join(new_msgs)
                return False, error_msg.strip()

    def __call__(self, query: str) -> Tuple[bool, str]:
        """Run :meth:`execute` in a worker thread, waiting at most ``timeout`` s.

        Returns the result of :meth:`execute`, or ``(False, "Timed out ...")``
        if the worker does not finish in time.
        """
        with ThreadPoolExecutor() as executor:
            future = executor.submit(self.execute, query)
            try:
                return future.result(timeout=self.timeout)
            except TimeoutError:
                return False, f"Timed out after {self.timeout} seconds."


def execute_completion(
    executor: PythonREPL,
    completion: str,
    return_status: bool = False,
    last_code_block: bool = False,
) -> str | Tuple[str, bool]:
    # executions = ["!" + code for code in re.findall(r"```bash(.*?)```", completion, re.DOTALL) if "!" not in code]
    executions = re.findall(r"```python(.*?)```", completion, re.DOTALL)

    if len(executions) == 0:  # directly return cot result
        return completion, False if return_status else completion
    else:
        if last_code_block:
            executions = [executions[-1]]

        # Python
        execution_outputs = []
        successes = []
        for code in executions:
            success = False

            if "subprocess" in code:
                output = "subprocess is not allowed"
                execution_outputs.append(output)
                successes.append(success)
                continue

            if "venv" in code:
                output = "venv is not allowed"
                execution_outputs.append(output)
                successes.append(success)
                continue

            try:
                success, output = executor(code)
            except TimeoutError as e:
                print("time out")
                output = e

            if not success and not return_status:
                output = ""

            execution_outputs.append(output)
            successes.append(success)

        output = str(execution_outputs[-1]).strip()
        success = successes[-1]

        if return_status:
            return output, success
        else:
            return output


def postprocess_completion(
    text: str, return_status: bool = False, last_code_block=False, timeout=5
) -> str | Tuple[str, bool]:
    """Execute the code blocks in ``text`` with a throwaway ``PythonREPL``.

    A new executor with the given ``timeout`` is created for this call only;
    the return value is whatever ``execute_completion`` returns for the same
    ``return_status`` / ``last_code_block`` flags.
    """
    return execute_completion(
        PythonREPL(timeout=timeout),
        text,
        return_status=return_status,
        last_code_block=last_code_block,
    )


def apply_template(example: Dict[str, Any], prompt: str) -> str:
    """Fill a two-slot prompt template with the problem text.

    Args:
        example: Mapping with a ``"prompt"`` key holding the problem text.
        prompt: ``str.format`` template. Its first positional field receives
            the problem; a second positional field, if present, is filled
            with a literal ``"{}"`` so it survives as a placeholder.

    Returns:
        The formatted prompt string.
    """
    return prompt.format(example["prompt"], "{}")


def last_boxed_only_string(string):
    """
    Returns the final ``\\boxed`` expression in a string, braces included.
    ``\\fbox`` is looked for only when no ``\\boxed`` is present.
    Args:
        string (str): Text that may contain LaTeX boxed/framed expressions.
    Returns:
        str or None: The substring from the command up to its matching
        closing brace, or None when no command or no balanced brace is found.
    """

    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
    if start < 0:
        return None

    # Walk forward tracking brace depth; the expression ends at the closing
    # brace that brings the depth back to zero.
    depth = 0
    for pos in range(start, len(string)):
        char = string[pos]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return string[start : pos + 1]

    return None


def remove_boxed(s):
    """
    Removes the LaTeX boxed command, returning the content inside the braces.
    Args:
        s (str): The string containing a LaTeX boxed expression; it must
        start with ``\\boxed{`` and end with ``}``.
    Returns:
        str or None: The content inside the boxed command, if valid;
        otherwise, None.
    """

    left = "\\boxed{"
    # Explicit checks instead of `assert`: assertions are stripped when Python
    # runs with -O, which would let malformed input through.
    if not isinstance(s, str):
        return None
    if not (s.startswith(left) and s.endswith("}")):
        return None
    return s[len(left) : -1]


def extract_boxed_answer(pred_str, strip_double_curly_brace=False):
    """
    Extracts the answer from the last LaTeX boxed expression within
    a prediction string.
    Args:
        pred_str (str): The string containing one or more LaTeX
        boxed expressions.
        strip_double_curly_brace (bool): If True, removes an additional
        layer of braces when the answer is fully wrapped in ``{...}``.
    Returns:
        str or None: The extracted answer, if any; otherwise, None.
    """

    boxed_str = last_boxed_only_string(pred_str)
    if boxed_str is None:
        return None
    answer = remove_boxed(boxed_str)
    if answer is None:
        return None
    if strip_double_curly_brace:
        # Raw string: "\{" in a normal literal is an invalid escape sequence
        # (a warning on recent Python versions). The pattern is unchanged.
        match = re.match(r"^\{(.*)\}$", answer)
        if match:
            answer = match.group(1)
    return answer


def normalize_final_answer(final_answer: str) -> str:
    """
    Normalizes a final answer to a quantitative reasoning question by
    removing or replacing various LaTeX and text elements.
    Args:
        final_answer (str): The answer string to normalize.
    Returns:
        str: The normalized answer string.
    """

    # Keep only the text before the first "Problem:" marker, if there is one.
    match = re.search(r"(.*?)Problem:", final_answer, flags=re.S)
    if match:
        final_answer = match.group(1)

    # final_answer = final_answer.split('=')[-1]
    SUBSTITUTIONS = [
        ("an ", ""),
        ("a ", ""),
        (".$", "$"),
        ("\\$", ""),
        (r"\ ", ""),
        (" ", ""),
        ("mbox", "text"),
        (",\\text{and}", ","),
        ("\\text{and}", ","),
        ("\\text{m}", "\\text{}"),
        ("\\le", "<"),
    ]
    REMOVED_EXPRESSIONS = [
        "square",
        "ways",
        "integers",
        "dollars",
        "mph",
        "inches",
        "ft",
        "hours",
        "km",
        "units",
        "\\ldots",
        "sue",
        "points",
        "feet",
        "minutes",
        "digits",
        "cents",
        "degrees",
        "cm",
        "gm",
        "pounds",
        "meters",
        "meals",
        "edges",
        "students",
        "childrentickets",
        "multiples",
        "\\text{s}",
        "\\text{.}",
        "\\text{\ns}",
        "\\text{}^2",
        "\\text{}^3",
        "\\text{\n}",
        "\\text{}",
        r"\mathrm{th}",
        r"^\circ",
        r"^{\circ}",
        r"\;",
        r",\!",
        "{,}",
        '"',
        "\\dots",
        "\n",
        "\r",
        "\f",
        # Backslash followed by a percent sign (written with a valid escape).
        "\\%",
    ]
    # Order matters: substitutions run first (including whitespace removal),
    # then unit words and LaTeX decorations are stripped.
    for before, after in SUBSTITUTIONS:
        final_answer = final_answer.replace(before, after)
    for expr in REMOVED_EXPRESSIONS:
        final_answer = final_answer.replace(expr, "")

    # Extract answer that is in LaTeX math, is bold,
    # is surrounded by a box, etc.
    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
    # Line-break characters were removed above; these are internal sanity
    # checks on that invariant.
    assert "\n" not in final_answer
    assert "\r" not in final_answer
    assert "\f" not in final_answer

    # Successively narrow to the text after an "answer is" phrase, inside a
    # remaining "...oxed{...}", or between dollar signs, taking the last
    # occurrence each time.
    found = re.findall(r"finalansweris(.*)", final_answer)
    if found:
        final_answer = found[-1]

    found = re.findall(r"answer?is:?(.*)", final_answer)
    if found:
        final_answer = found[-1]

    found = re.findall(r"oxed\{(.*?)\}", final_answer)
    if found:
        final_answer = found[-1]

    found = re.findall(r"\$(.*?)\$", final_answer)
    if found:
        final_answer = found[-1]
    final_answer = final_answer.strip()

    # Restore "\frac" when its leading characters were lost.
    if "rac" in final_answer and "\\frac" not in final_answer:
        final_answer = final_answer.replace("rac", "\\frac")

    # Add braces to shorthand forms such as \frac12 and \sqrt2.
    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
    final_answer = final_answer.replace("$", "")

    # Drop thousands separators from plain integers.
    if final_answer.replace(",", "").isdigit():
        final_answer = final_answer.replace(",", "")

    return final_answer


def naive_parse(answer: str) -> str:
    """
    Returns the last run of consecutive ASCII digits in the input string.

    Trailing non-digit characters are skipped, then digits are collected
    backwards until the first non-digit character is reached.

    Args:
        answer (str): The input string to parse.

    Returns:
        str: The digits of the last numeric run, in their original order, or
        an empty string when the input contains no digit.

    Example:
        >>> naive_parse("abc123def")
        '123'
        >>> naive_parse("def456ghi")
        '456'
        >>> naive_parse("no numbers here")
        ''
    """
    digits = "0123456789"

    # Find the end of the last digit run (exclusive index).
    stop = len(answer)
    while stop > 0 and answer[stop - 1] not in digits:
        stop -= 1

    # Extend backwards over the digits that make up that run.
    begin = stop
    while begin > 0 and answer[begin - 1] in digits:
        begin -= 1

    return answer[begin:stop]


def validate_answer_is_numeric(x: str | int | float) -> int:
    FLOAT_TOLERANCE = 0.2
    try:
        x = round(float(x))
        f = float(x)
        if abs(x - f) > FLOAT_TOLERANCE:
            x = -1
    except Exception:
        x = -1
    return x


def get_majority_vote(responses: List[int]) -> int:
    """Return the most frequent response, or 0 for an empty list.

    When several values share the highest count, the one that appears first
    in ``responses`` is returned.
    """
    if not responses:
        return 0
    [(winner, _votes)] = Counter(responses).most_common(1)
    return winner


def filter_answers(answers: List[str]) -> List[int]:
    """Turn raw answers into integers in the range 0..999.

    Each answer goes through ``validate_answer_is_numeric``; negative results
    (including its -1 "invalid" marker) are dropped and the rest are reduced
    modulo 1000. Input order is preserved.
    """
    kept = []
    for raw in answers:
        value = validate_answer_is_numeric(raw)
        # Discard invalid / negative answers
        if value < 0:
            continue
        value %= 1_000
        # Upper-bound guard retained from the original pipeline
        if value <= 999:
            kept.append(value)
    return kept


def check_sympy_equivalence(ref_answer: str, model_answer: str) -> bool:
    """Compare two LaTeX answers by parsing them and simplifying their difference.

    Returns True when the simplified difference evaluates numerically to
    within 1e-12 of zero or is reported as zero. Any exception raised while
    parsing or comparing is printed and treated as "not equivalent".
    """
    try:
        ref_expr = parse_latex(ref_answer)
        model_expr = parse_latex(model_answer)
        difference = simplify(ref_expr - model_expr)
        return bool(-1e-12 < N(difference) < 1e-12 or difference.is_zero)
    except Exception as e:
        print(e)
        return False


def check_string_match(ref_answer: str, model_answer: str) -> bool:
    """Return the result of ``ref_answer == model_answer``.

    If the comparison itself raises, the exception is printed and False is
    returned.
    """
    try:
        same = ref_answer == model_answer
    except Exception as e:
        print(e)
        return False
    return same


def check_answer(ref_answer: str, model_answer: str) -> bool:
    """Decide whether the model answer matches the reference answer.

    An exact string match is tried first; only if that fails are the two
    answers compared as expressions via ``check_sympy_equivalence``.
    """
    if check_string_match(ref_answer, model_answer):
        return True
    if check_sympy_equivalence(ref_answer, model_answer):
        return True
    return False


# Run settings. These are plain module-level variables that are copied into
# the Config instance below.
debug = False
model_id = "Numina-Math-7B"
revision = "main"
# Template applied to each problem; "{}" passes the problem text through as-is.
system_prompt = "{}"
validation_set = "kaggle-validation-set-medium"
is_submission = True
num_samples = 4
# Number of generation rounds run per problem.
num_generations = 4
temperature = 0.8
is_quantized = False
restart_on_fail = False
top_p = 1.0
top_k = 0
max_new_tokens = 2048
# Papermill related variables
push_to_hub = False
notebook_name = ""

# Bundle the settings above into a single Config object (do_sample is fixed to
# True here; notebook_time_limit keeps its dataclass default).
config = Config(
    debug=debug,
    push_to_hub=push_to_hub,
    model_id=model_id,
    revision=revision,
    system_prompt=system_prompt,
    validation_set=validation_set,
    is_quantized=is_quantized,
    restart_on_fail=restart_on_fail,
    is_submission=is_submission,
    num_samples=num_samples,
    num_generations=num_generations,
    do_sample=True,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    max_new_tokens=max_new_tokens,
)
# Printed once at import time so the effective configuration shows in the logs.
print(f"=== Running submission with config ===\n\n{config}")


def generate(message, temperature=None):
    """Stream a chat completion from the remote endpoint.

    Args:
        message: List of chat messages (``{"role": ..., "content": ...}``
            dicts) sent as the conversation.
        temperature: Sampling temperature for this request. When omitted,
            ``config.temperature`` is used, so the value does not depend on
            whatever the module-level name ``temperature`` is bound to at
            call time.

    Yields:
        The ``delta.content`` of each streamed chunk. This may be ``None``
        for chunks that carry no text; callers are expected to skip those.
    """
    if temperature is None:
        temperature = config.temperature

    chat_completion = client.chat.completions.create(
        model="tgi",
        messages=message,
        stream=True,
        max_tokens=1024,
        # Stop as soon as the model opens an output block so the code it
        # wrote can be executed and the real output appended.
        stop=["```output\n"],
        temperature=temperature,
    )

    for chunk in chat_completion:
        # Skip chunks without choices rather than failing on the index.
        if not chunk.choices:
            continue
        yield chunk.choices[0].delta.content


def get_majority_text(data):
    """Return the generated text that belongs to the most common model answer.

    ``data["model_answers"]`` and ``data["gen_texts"]`` are read as parallel
    lists. The text returned is the one at the position where the majority
    answer first occurs.
    """
    from collections import Counter

    answers = data["model_answers"]

    # Most frequent answer (the earliest one wins a tie)
    [(top_answer, _count)] = Counter(answers).most_common(1)

    # Text generated alongside the first occurrence of that answer
    return data["gen_texts"][answers.index(top_answer)]


def extract_solution(text):
    """Return the stripped text after the first "### Solution:" marker.

    An empty string is returned when the marker does not occur in ``text``.
    """
    _before, marker, after = text.partition("### Solution:")
    return after.strip() if marker else ""


def process_code(
    example: Dict[str, Any],
    config: Config,
    restart_on_fail: bool = False,
    last_step: bool = False,
) -> Dict[str, Any]:
    """Inspect the text generated so far and advance the sample by one step.

    The sample dict is updated in place and returned. Depending on
    ``example["gen_texts"]`` one of the following happens, checked in order:

    1. No ```python block exists: reset ``gen_texts`` to ``example["text"]``
       when ``restart_on_fail`` is set, otherwise mark the sample for pruning
       and set ``has_code`` to False.
    2. The text does not end with an opening output fence and its last 100
       characters contain "answer is" or "\\boxed": treat this as a final
       answer. If no ```output block was ever produced the sample is pruned
       without an answer; otherwise the answer is extracted, normalized and
       stored in ``model_answers`` (and scored into ``corrects`` when
       ``config.is_submission`` is false), and the sample is pruned.
    3. ``last_step`` is set: return the sample unchanged.
    4. The text does not end with an opening output fence: restart or prune
       as in case 1 (without touching ``has_code``).
    5. Otherwise: execute the last ```python block and append its output
       (truncated to 200 characters) plus a closing fence to ``gen_texts``.

    Args:
        example: Sample state; reads ``gen_texts`` and, depending on the
            branch, ``text``, ``ground_truth`` and ``corrects``.
        config: Only ``config.is_submission`` is read here.
        restart_on_fail: Restart from the initial text instead of pruning.
        last_step: True on the final generation round.

    Returns:
        The same ``example`` dict after the updates described above.
    """
    gen_text = example["gen_texts"]
    num_python_blocks = len(re.findall(r"```python(.*?)```", gen_text, re.DOTALL))

    if num_python_blocks == 0:
        if restart_on_fail:
            print("no code has ever been generated, RESTARTING")
            # reset the text to the original
            example["gen_texts"] = example["text"]
        else:
            print("no code has ever been generated, STOP")
            example["should_prune"] = True
            example["has_code"] = False
        return example

    # "```output\n" is exactly 10 characters, hence the [-10:] slice.
    if gen_text[-10:] != "```output\n" and ("answer is" in gen_text[-100:] or "\\boxed" in gen_text[-100:]):
        num_output_blocks = len(re.findall(r"```output(.*?)```", gen_text, re.DOTALL))
        if num_output_blocks == 0:
            # An answer was stated although no code output was ever appended.
            print("the model hallucinated the code answer")
            example["should_prune"] = True
            return example

        if "boxed" in gen_text[-100:]:
            try:
                answer = normalize_final_answer(extract_boxed_answer(gen_text[-100:]))
            except Exception:
                # e.g. extract_boxed_answer returned None for an incomplete box
                answer = "-1"
        else:
            answer = normalize_final_answer(gen_text[-100:])

        example["model_answers"] = answer
        if not config.is_submission:
            example["corrects"] = check_answer(example["ground_truth"], answer)
        example["should_prune"] = True
        # NOTE: reads "ground_truth" and "corrects", so both keys must already
        # be present in the sample.
        print("Answer is: ", answer, example["ground_truth"], example["corrects"])
        return example

    if last_step:
        # no point in continuing if we are at the last step
        return example

    if gen_text[-10:] != "```output\n":
        # something else has gone wrong with the generation
        print("warning: output block not found: ", gen_text[-40:])
        if restart_on_fail:
            example["gen_texts"] = example["text"]
        else:
            example["should_prune"] = True
        return example

    # Only the last code block is executed; `status` is not used afterwards.
    code_result, status = postprocess_completion(gen_text, return_status=True, last_code_block=True)
    # add the code result for the next round of generation
    TRUNCATION_LIMIT = 200
    if len(code_result) > TRUNCATION_LIMIT:
        code_result = code_result[:TRUNCATION_LIMIT] + " ... (output truncated)"
    # Append the output and close the fence that the generation left open.
    example["gen_texts"] = gen_text + f"{code_result}\n```"

    return example


# load the vllm instance and set sampling parameters
# vllm = build_vllm(config)


def solve_problem(problem, temperature, progress=gr.Progress()):
    """Solve one problem, yielding the growing solution text as it is produced.

    Each round streams a model continuation, hands the text to
    ``process_code`` (which may append code output or mark the sample as
    finished), and then streams whatever was appended one character at a
    time. The loop runs for at most ``config.num_generations`` rounds and
    stops early once the sample is marked for pruning.

    Args:
        problem: Problem statement entered by the user.
        temperature: Sampling temperature forwarded to ``generate``.
        progress: Gradio progress tracker used to wrap the round loop.

    Yields:
        The accumulated text after every streamed piece, and finally the
        sample's complete ``gen_texts``.
    """
    problem = apply_template({"prompt": problem}, prompt=config.system_prompt)
    print(f"Problem: {problem}")

    # Several keys exist only because process_code expects them; they are
    # not meaningful for interactive use.
    sample = {
        "problem": problem,
        "ground_truth": "unknown",
        "text": "### Solution:\n",
        "gen_texts": "### Solution:\n",  # accumulates all generated text
        "should_prune": False,
        "problem_index": -1,
        "model_answers": "-1",
        "has_code": True,
        "corrects": False,
    }

    final_step = config.num_generations - 1
    rounds = progress.tqdm(range(config.num_generations), desc="Generating candidates")
    for step in rounds:
        shown = sample["gen_texts"]

        chat = [
            {"role": "user", "content": sample["problem"]},
            {"role": "assistant", "content": shown},
        ]

        # Stream the model continuation, skipping chunks without text.
        for piece in generate(chat, temperature):
            if piece is None:
                continue
            shown += piece
            yield shown

        sample["gen_texts"] = shown

        sample = process_code(
            sample,
            config=config,
            restart_on_fail=config.restart_on_fail,
            last_step=step == final_step,
        )
        sample["gen_texts"] += "\n"

        # Whatever process_code appended beyond the streamed text is emitted
        # character by character.
        appended = sample["gen_texts"].replace(shown, "")
        for char in appended:
            shown += char
            yield shown

        if sample["should_prune"]:
            break

    yield sample["gen_texts"]


# Gradio UI: a problem text box, a temperature slider under "Advanced
# Options", a Markdown output area and a "Run" button wired to solve_problem.
with gr.Blocks() as demo:
    with gr.Row():
        inp = gr.Textbox(placeholder="Problem", label="Problem", lines=5)
    with gr.Accordion("Advanced Options", open=False):
        # Deliberately not named `temperature`: that would rebind the
        # module-level float of the same name (defined with the other run
        # settings) to a UI component for any code reading it as a global.
        temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.1, label="Temperature")
    with gr.Row():
        out = gr.Markdown()

    btn = gr.Button("Run")
    btn.click(fn=solve_problem, inputs=[inp, temperature_slider], outputs=out)


if __name__ == "__main__":
    demo.queue(default_concurrency_limit=5).launch()