Upload 14 files
- README.md +9 -13
- app.py +97 -0
- code_executor.py +111 -0
- data_loader.py +78 -0
- data_utils.py +358 -0
- evaluate.py +71 -0
- grader.py +305 -0
- main.py +468 -0
- prompt_templates/reward_template.md +44 -0
- prompt_templates/sage_template.md +45 -0
- prompt_templates/swift_template.md +152 -0
- run_eval.sh +6 -0
- test.py +28 -0
- utils.py +260 -0
README.md
CHANGED
@@ -1,13 +1,9 @@
-
-
-
-
-
-
-
-
-
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+## 🤖 SwiftSage (v2):
+
+> [!IMPORTANT]
+> The code of SwiftSage v1 (for the experiments in NeurIPS 2023) is archived in the [`science_world`](https://github.com/SwiftSage/SwiftSage/tree/science_world) branch.
+
+
+<!-- Github Readme Important Callout box note -->
+
+
app.py
ADDED
@@ -0,0 +1,97 @@
import gradio as gr
import os
import json
import logging
import numpy as np
from utils import (PromptTemplate, api_configs, setup_logging)
from data_loader import load_data
from evaluate import evaluate
from main import SwiftSage, run_test, run_benchmark
import multiprocessing


def solve_problem(problem, max_iterations, reward_threshold, swift_model_id, sage_model_id, reward_model_id, use_retrieval, start_with_sage):
    # Configuration for each LLM
    max_iterations = int(max_iterations)
    reward_threshold = int(reward_threshold)

    swift_config = {
        "model_id": swift_model_id,
        "api_config": api_configs['Together']
    }

    reward_config = {
        "model_id": reward_model_id,
        "api_config": api_configs['Together']
    }

    sage_config = {
        "model_id": sage_model_id,
        "api_config": api_configs['Together']
    }

    # specify the path to the prompt templates
    prompt_template_dir = './prompt_templates'
    dataset = []
    embeddings = []  # TODO: for retrieval augmentation (not implemented yet)
    s2 = SwiftSage(
        dataset,
        embeddings,
        prompt_template_dir,
        swift_config,
        sage_config,
        reward_config,
        use_retrieval=use_retrieval,
        start_with_sage=start_with_sage,
    )

    reasoning, solution = s2.solve(problem, max_iterations, reward_threshold)
    solution = solution.replace("Answer (from running the code):\n ", " ")
    return reasoning, solution


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # gr.Markdown("## SwiftSage: A Multi-Agent Framework for Reasoning")
    # use HTML so the title can be centered
    gr.HTML("<h1 style='text-align: center;'>SwiftSage: A Multi-Agent Framework for Reasoning</h1>")

    with gr.Row():
        swift_model_id = gr.Textbox(label="😄 Swift Model ID", value="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo")
        reward_model_id = gr.Textbox(label="🤔 Feedback Model ID", value="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo")
        sage_model_id = gr.Textbox(label="😎 Sage Model ID", value="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo")
        # the following two should have a smaller width

    with gr.Accordion(label="⚙️ Advanced Options", open=False):
        with gr.Row():
            with gr.Column():
                max_iterations = gr.Textbox(label="Max Iterations", value="5")
                reward_threshold = gr.Textbox(label="Reward Threshold", value="8")
            # TODO: wire top-p and temperature for each module into the pipeline
            with gr.Column():
                top_p_swift = gr.Textbox(label="Top-p for Swift", value="0.9")
                temperature_swift = gr.Textbox(label="Temperature for Swift", value="0.7")
            with gr.Column():
                top_p_sage = gr.Textbox(label="Top-p for Sage", value="0.9")
                temperature_sage = gr.Textbox(label="Temperature for Sage", value="0.7")
            with gr.Column():
                top_p_reward = gr.Textbox(label="Top-p for Feedback", value="0.9")
                temperature_reward = gr.Textbox(label="Temperature for Feedback", value="0.7")

    use_retrieval = gr.Checkbox(label="Use Retrieval Augmentation", value=False, visible=False)
    start_with_sage = gr.Checkbox(label="Start with Sage", value=False, visible=False)

    problem = gr.Textbox(label="Input your problem", value="How many letter r are there in the sentence 'My strawberry is so ridiculously red.'?", lines=2)

    solve_button = gr.Button("🚀 Solve Problem")
    reasoning_output = gr.Textbox(label="Reasoning steps with Code", interactive=False)
    solution_output = gr.Textbox(label="Final answer", interactive=False)

    solve_button.click(
        solve_problem,
        inputs=[problem, max_iterations, reward_threshold, swift_model_id, sage_model_id, reward_model_id, use_retrieval, start_with_sage],
        outputs=[reasoning_output, solution_output]
    )

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    demo.launch(share=False, show_api=False)
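For illustration, a minimal sketch of calling the pipeline headlessly, assuming Together API credentials are already configured wherever `utils.api_configs['Together']` expects them (the model IDs just mirror the demo defaults above):

```python
# Minimal headless sketch, assuming Together API credentials are set up
# for utils.api_configs['Together']; model IDs mirror the demo defaults.
from app import solve_problem  # importing app builds (but does not launch) the Gradio UI

if __name__ == "__main__":  # the solver runs generated code in worker processes
    reasoning, solution = solve_problem(
        problem="How many letter r are there in the sentence 'My strawberry is so ridiculously red.'?",
        max_iterations="5",
        reward_threshold="8",
        swift_model_id="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        sage_model_id="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        reward_model_id="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        use_retrieval=False,
        start_with_sage=False,
    )
    print(solution)
```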
code_executor.py
ADDED
@@ -0,0 +1,111 @@
"""
Source and credits: https://github.com/ZubinGou/math-evaluation-harness/blob/main/python_executor.py

We modified it to be simpler.
"""

import io
import pickle
import traceback
from concurrent.futures import ProcessPoolExecutor, TimeoutError
from contextlib import redirect_stdout
from typing import Any


class GenericRuntime:
    GLOBAL_DICT = {}
    LOCAL_DICT = None
    HEADERS = []

    def __init__(self):
        self._global_vars = self.GLOBAL_DICT.copy()
        self._local_vars = self.LOCAL_DICT.copy() if self.LOCAL_DICT else None

        for c in self.HEADERS:
            self.exec_code(c)

    def exec_code(self, code_piece: str) -> None:
        exec(code_piece, self._global_vars)

    def eval_code(self, expr: str) -> Any:
        return eval(expr, self._global_vars)

    def inject(self, var_dict):
        self._global_vars.update(var_dict)

    @property
    def answer(self):
        return self._global_vars['answer']


class PythonExecutor:
    def __init__(
        self,
        runtime=None,
        get_answer_symbol=None,
        get_answer_expr=None,
        get_answer_from_stdout=False,
        timeout_length=5,
    ):
        self.runtime = runtime if runtime else GenericRuntime()
        self.answer_symbol = get_answer_symbol
        self.get_answer_expr = get_answer_expr
        self.get_answer_from_stdout = get_answer_from_stdout
        self.timeout_length = timeout_length

    def execute(self, code):
        try:
            if self.get_answer_from_stdout:
                program_io = io.StringIO()
                with redirect_stdout(program_io):
                    self.runtime.exec_code('\n'.join(code))
                program_io.seek(0)
                result = program_io.read()
            elif self.answer_symbol:
                self.runtime.exec_code('\n'.join(code))
                result = self.runtime._global_vars[self.answer_symbol]
            elif self.get_answer_expr:
                self.runtime.exec_code('\n'.join(code))
                result = self.runtime.eval_code(self.get_answer_expr)
            else:
                self.runtime.exec_code('\n'.join(code[:-1]))
                result = self.runtime.eval_code(code[-1])

            report = "Done"
            pickle.dumps(result)  # serialization check
        except Exception as e:
            result = ''
            report = str(e)

        return result, report

    def apply(self, code):
        code_snippet = code.split('\n')

        # Use ProcessPoolExecutor to enforce a timeout
        with ProcessPoolExecutor() as executor:
            future = executor.submit(self.execute, code_snippet)
            try:
                result, report = future.result(timeout=self.timeout_length)
            except TimeoutError:
                result, report = "", "Timeout Error"

        return result.strip(), report.strip()


# Example usage
if __name__ == "__main__":
    executor = PythonExecutor(get_answer_from_stdout=True)
    code = """
from sympy import Matrix

def null_space_basis():
    A = Matrix([[3, 3, -1, -6], [9, -1, -8, -1], [7, 4, -2, -9]])
    basis = A.nullspace()
    return [v.evalf(3) for v in basis]

result = null_space_basis()
print(result)
"""
    result, report = executor.apply(code)
    print("Result:", result)
    print("Report:", report)
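A usage sketch for `PythonExecutor.apply`: the snippet runs in a worker process and the wait on the result is capped at `timeout_length` seconds. Note that `ProcessPoolExecutor`'s shutdown still waits for the worker to finish, so the timeout bounds what is returned, not how long a runaway snippet can keep the worker busy.

```python
# Sketch, assuming code_executor.py is importable from the working directory.
from code_executor import PythonExecutor

if __name__ == "__main__":  # the pool spawns worker processes
    executor = PythonExecutor(get_answer_from_stdout=True, timeout_length=2)

    result, report = executor.apply("print(sum(range(10)))")
    print(result, report)  # -> 45 Done

    # A finite snippet that overruns the budget is reported as a timeout
    # (apply still blocks until the worker finishes sleeping, then returns).
    result, report = executor.apply("import time\ntime.sleep(5)\nprint('late')")
    print(repr(result), report)  # -> '' Timeout Error
```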
data_loader.py
ADDED
@@ -0,0 +1,78 @@
import json
import os
import re
import random
from typing import Any, Iterable, Union

from datasets import Dataset, concatenate_datasets, load_dataset

from data_utils import (
    lower_keys,
    parse_question,
    parse_ground_truth,
)


def load_jsonl(file):
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                yield json.loads(line)
            except:
                print("Error in loading:", line)
                exit()


def load_data(
    data_name,
    split='test',
    data_dir='./data',
    num_test_sample=-1,
):
    if data_name.lower() == "math":
        data_name = 'MATH'  # we use the 500-problem test split from "Let's Verify Step by Step"
    data_file = f"{data_dir}/{data_name}/{split}.jsonl"
    if os.path.exists(data_file):
        examples = list(load_jsonl(data_file))
    else:
        if data_name == "mmlu_stem":
            dataset = load_dataset("hails/mmlu_no_train", 'all', split='test')
            # only keep STEM subjects
            stem_subjects = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry',
                             'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security',
                             'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology',
                             'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics',
                             'high_school_physics', 'high_school_statistics', 'machine_learning']
            dataset = dataset.rename_column("subject", "type")
            dataset = dataset.filter(lambda x: x['type'] in stem_subjects)
        elif data_name == "mathvista":
            raise NotImplementedError(data_name)
        elif data_name == "gpqa":
            dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
        elif data_name == "codeforces":
            raise NotImplementedError(data_name)
        else:
            raise NotImplementedError(data_name)

        examples = list(dataset)
        examples = [lower_keys(example) for example in examples]
        dataset = Dataset.from_list(examples)
        os.makedirs(f"{data_dir}/{data_name}", exist_ok=True)
        dataset.to_json(data_file)

    # add 'idx' as the first column
    if 'idx' not in examples[0]:
        examples = [{'idx': i, **example} for i, example in enumerate(examples)]

    # deduplicate & sort
    examples = sorted(examples, key=lambda x: x['idx'])

    if num_test_sample > 0:
        examples = examples[:num_test_sample]

    return examples


if __name__ == "__main__":
    examples = load_data("gpqa", "test")
    print(len(examples))
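A quick sketch of the `load_data` contract: examples come back as plain dicts with lower-cased keys and an added `idx` field, and hub-backed datasets are downloaded via `datasets` on first use and cached to `./data/<name>/<split>.jsonl`:

```python
# Sketch: first call downloads and caches ./data/mmlu_stem/test.jsonl,
# later calls read the cached JSONL directly.
from data_loader import load_data

examples = load_data("mmlu_stem", split="test", num_test_sample=5)
for ex in examples:
    # 'type' holds the renamed MMLU 'subject' column; keys are lower-cased
    print(ex["idx"], ex["type"], ex["question"][:60])
```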
data_utils.py
ADDED
@@ -0,0 +1,358 @@
"""
Source and credits: https://github.com/ZubinGou/math-evaluation-harness/blob/main/python_executor.py
"""
import re
import regex
import sympy
from typing import TypeVar, Iterable, List, Union, Any, Dict
from word2number import w2n
from utils import *


def lower_keys(example):
    new_example = {}
    for key, value in example.items():
        if key != key.lower():
            new_key = key.lower()
            new_example[new_key] = value
        else:
            new_example[key] = value
    return new_example


def _fix_fracs(string):
    substrs = string.split("\\frac")
    new_str = substrs[0]
    if len(substrs) > 1:
        substrs = substrs[1:]
        for substr in substrs:
            new_str += "\\frac"
            if len(substr) > 0 and substr[0] == "{":
                new_str += substr
            else:
                try:
                    assert len(substr) >= 2
                except:
                    return string
                a = substr[0]
                b = substr[1]
                if b != "{":
                    if len(substr) > 2:
                        post_substr = substr[2:]
                        new_str += "{" + a + "}{" + b + "}" + post_substr
                    else:
                        new_str += "{" + a + "}{" + b + "}"
                else:
                    if len(substr) > 2:
                        post_substr = substr[2:]
                        new_str += "{" + a + "}" + b + post_substr
                    else:
                        new_str += "{" + a + "}" + b
    string = new_str
    return string


def _fix_a_slash_b(string):
    if len(string.split("/")) != 2:
        return string
    a = string.split("/")[0]
    b = string.split("/")[1]
    try:
        if "sqrt" not in a:
            a = int(a)
        if "sqrt" not in b:
            b = int(b)
        assert string == "{}/{}".format(a, b)
        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
        return new_string
    except:
        return string


def _fix_sqrt(string):
    _string = re.sub(r"\\sqrt(\w+)", r"\\sqrt{\1}", string)
    return _string


def convert_word_number(text: str) -> str:
    try:
        text = str(w2n.word_to_num(text))
    except:
        pass
    return text


# units mainly from MathQA
unit_texts = [
    "east", "degree", "mph", "kmph", "ft", "m sqaure", " m east", "sq m", "deg", "mile",
    "q .", "monkey", "prime", "ratio", "profit of rs", "rd", "o", "gm",
    "p . m", "lb", "tile", "per", "dm", "lt", "gain", "ab", "way", "west",
    "a .", "b .", "c .", "d .", "e .", "f .", "g .", "h .", "t", "a", "h",
    "no change", "men", "soldier", "pie", "bc", "excess", "st",
    "inches", "noon", "percent", "by", "gal", "kmh", "c", "acre", "rise",
    "a . m", "th", "π r 2", "sq", "mark", "l", "toy", "coin",
    "sq . m", "gallon", "° f", "profit", "minw", "yr", "women",
    "feet", "am", "pm", "hr", "cu cm", "square", "v â € ™", "are",
    "rupee", "rounds", "cubic", "cc", "mtr", "s", "ohm", "number",
    "kmph", "day", "hour", "minute", "min", "second", "man", "woman",
    "sec", "cube", "mt", "sq inch", "mp", "∏ cm ³", "hectare", "more",
    "sec", "unit", "cu . m", "cm 2", "rs .", "rs", "kg", "g", "month",
    "km", "m", "cm", "mm", "apple", "liter", "loss", "yard",
    "pure", "year", "increase", "decrease", "d", "less", "Surface",
    "litre", "pi sq m", "s .", "metre", "meter", "inch",
]

unit_texts.extend([t + "s" for t in unit_texts])


def strip_string(string):
    string = str(string).strip()
    # linebreaks
    string = string.replace("\n", "")

    # right "."
    string = string.rstrip(".")

    # remove inverse spaces
    # replace \\ with \
    string = string.replace("\\!", "")
    # string = string.replace("\\ ", "")
    # string = string.replace("\\\\", "\\")

    # matrix
    string = re.sub(r'\\begin\{array\}\{.*?\}', r'\\begin{pmatrix}', string)
    string = re.sub(r'\\end\{array\}', r'\\end{pmatrix}', string)
    string = string.replace("bmatrix", "pmatrix")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")
    string = string.replace("\\{", "{")
    string = string.replace("\\}", "}")

    # Remove unit: miles, dollars if after is not none
    _string = re.sub(r"\\text{.*?}$", "", string).strip()
    if _string != "" and _string != string:
        # print("Warning: unit not removed: '{}' -> '{}'".format(string, _string))
        string = _string

    # Remove unit: texts
    for _ in range(2):
        for unit_text in unit_texts:
            # use regex; the prefix should be either the start of the string or a non-alphanumeric
            # character, and the suffix either the end of the string or a non-alphanumeric character
            _string = re.sub(r"(^|\W)" + unit_text + r"($|\W)", r"\1\2", string)
            if _string != "":
                string = _string

    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove dollar signs
    string = string.replace("\\$", "")
    string = string.replace("$", "")

    # convert word number to digit
    string = convert_word_number(string)

    # replace "\\text{...}" with "..."
    string = re.sub(r"\\text\{(.*?)\}", r"\1", string)
    for key in ['x=', 'y=', 'z=', 'x\\in', 'y\\in', 'z\\in', 'x\\to', 'y\\to', 'z\\to']:
        string = string.replace(key, "")
    string = string.replace("\\emptyset", r"{}")
    string = string.replace("(-\\infty,\\infty)", "\\mathbb{R}")

    # remove percentage
    string = string.replace("\\%", "")
    string = string.replace("\%", "")
    string = string.replace("%", "")

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")

    # cdot
    # string = string.replace("\\cdot", "")
    if string.startswith("{") and string.endswith("}") and string.isalnum() or \
            string.startswith("(") and string.endswith(")") and string.isalnum() or \
            string.startswith("[") and string.endswith("]") and string.isalnum():
        string = string[1:-1]

    # inf
    string = string.replace("infinity", "\\infty")
    if "\\infty" not in string:
        string = string.replace("inf", "\\infty")
    string = string.replace("+\\inity", "\\infty")

    # and
    string = string.replace("and", "")
    string = string.replace("\\mathbf", "")

    # use regex to remove \mbox{...}
    string = re.sub(r"\\mbox{.*?}", "", string)

    # quote
    string = string.replace("'", "")
    string = string.replace("\"", "")

    # i, j
    if "j" in string and "i" not in string:
        string = string.replace("j", "i")

    # replace a.000b, where b is not a digit or is the end of the string, with ab
    string = re.sub(r"(\d+)\.0*([^\d])", r"\1\2", string)
    string = re.sub(r"(\d+)\.0*$", r"\1", string)

    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == ".":
        string = "0" + string

    # to consider: get rid of e.g. "k = " or "q = " at beginning
    if len(string.split("=")) == 2:
        if len(string.split("=")[0]) <= 2:
            string = string.split("=")[1]

    string = _fix_sqrt(string)
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
    # Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
    string = _fix_fracs(string)

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = _fix_a_slash_b(string)

    return string


def extract_multi_choice_answer(pred_str):
    # TODO: SFT models
    if 'Problem:' in pred_str:
        pred_str = pred_str.split("Problem:", 1)[0]
    pred_str = pred_str.replace("choice is", "answer is")
    patt = regex.search(r"answer is \(?(?P<ans>[abcde])\)?", pred_str.lower())
    if patt is not None:
        return patt.group('ans').upper()
    return 'placeholder'


def extract_answer(pred_str, data_name):
    if data_name in ["mmlu_stem", "sat_math", "mathqa"]:
        return extract_multi_choice_answer(pred_str)

    if 'final answer is $' in pred_str and '$. I hope' in pred_str:
        # minerva_math
        tmp = pred_str.split('final answer is $', 1)[1]
        pred = tmp.split('$. I hope', 1)[0].strip()
    elif 'boxed' in pred_str:
        ans = pred_str.split('boxed')[-1]
        if len(ans) == 0:
            return ""
        elif ans[0] == '{':
            stack = 1
            a = ''
            for c in ans[1:]:
                if (c == '{'):
                    stack += 1
                    a += c
                elif (c == '}'):
                    stack -= 1
                    if (stack == 0):
                        break
                    a += c
                else:
                    a += c
        else:
            a = ans.split('$')[0].strip()
        pred = a
    elif ('he answer is' in pred_str):
        pred = pred_str.split('he answer is')[-1].strip()
    elif ('final answer is' in pred_str):
        pred = pred_str.split('final answer is')[-1].strip()
    # elif extract_program_output(pred_str) != "":
    #     # fall back to program output
    #     pred = extract_program_output(pred_str)
    else:  # use the last number
        pattern = r'-?\d*\.?\d+'
        pred = re.findall(pattern, pred_str.replace(",", ""))
        if (len(pred) >= 1):
            pred = pred[-1]
        else:
            pred = ''

    # multiple lines
    # pred = pred.split("\n")[0]
    pred = re.sub(r"\n\s*", "", pred)
    if pred != "" and pred[0] == ":":
        pred = pred[1:]
    if pred != "" and pred[-1] == ".":
        pred = pred[:-1]
    if pred != "" and pred[-1] == "/":
        pred = pred[:-1]
    pred = strip_string(pred)
    return pred


def parse_ground_truth(example: Dict[str, Any], data_name):
    # parse ground truth
    if data_name in ["MATH", "math", "math_oai", "minerva_math", "ocw", "amps", "hungarian_exam"]:
        gt_ans = example['answer']
    elif data_name == "gsm8k":
        gt_ans = example['answer'].split("####")[-1]
    elif data_name == "mmlu_stem":
        abcd = 'ABCD'
        gt_ans = abcd[example['answer']]
    elif data_name == "gpqa":
        gt_ans = example['correct answer']
    else:
        raise NotImplementedError(f"`{data_name}`")
    # post process
    gt_ans = strip_string(gt_ans)
    return gt_ans


def parse_question(example, data_name):
    question = ""
    if data_name == "mmlu_stem":
        options = example['choices']
        assert len(options) == 4
        for i, (label, option) in enumerate(zip('ABCD', options)):
            options[i] = f"({label}) {str(option).strip()}"
        options = ", ".join(options)
        question = f"{example['question'].strip()}\nWhat of the following is the right choice? Explain your answer.\n{options}"
    else:
        for key in ['question', 'problem', 'Question', 'input']:
            if key in example:
                question = example[key]
                break
    assert question != ""
    # Yes or No question
    gt_ans = parse_ground_truth(example, data_name)
    gt_lower = gt_ans.lower()
    if gt_lower in ["true", "false"]:
        question += " (True or False)"
    if gt_lower in ["yes", "no"]:
        question += " (Yes or No)"
    return question.strip()


def _test_extract_answer():
    text = """
The answer is $\\boxed{\left(
\\begin{array}{ccc}
 -13 & 4 & -2 \\\\
 7 & 8 & -3 \\\\
 0 & 18 & -7 \\\\
 6 & 12 & 5 \\\\
\\end{array}
\\right)}$.
"""
    print(extract_answer(text, "math"))
    # should print the extracted boxed matrix as a pmatrix string


if __name__ == "__main__":
    _test_extract_answer()
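A short sketch of the extraction path, with outputs that follow from the code above: `extract_answer` pulls the `\boxed{...}` span with brace balancing, then `strip_string` normalizes the LaTeX.

```python
# Sketch of the answer-extraction contract in data_utils.
from data_utils import extract_answer, strip_string

pred = "So the final result is $\\boxed{\\dfrac{1}{2}}$."
print(extract_answer(pred, "math"))  # \frac{1}{2}  (dfrac is normalized to frac)
print(strip_string("50\\%"))         # 50           (percent sign stripped)
```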
evaluate.py
ADDED
@@ -0,0 +1,71 @@
"""
Source and credits: https://github.com/ZubinGou/math-evaluation-harness/blob/main/python_executor.py
"""
import argparse
import json
from concurrent.futures import TimeoutError

import numpy as np
from pebble import ProcessPool
from tqdm import tqdm

from grader import math_equal_process


def evaluate(samples: list = None, file_path: str = None):
    assert samples or file_path, "samples or file_path must be provided"
    if not samples:
        with open(file_path, 'r') as f:
            samples = [json.loads(line) for line in f]

    # dedup by idx
    if 'idx' in samples[0]:
        samples = {sample['idx']: sample for sample in samples}.values()
        samples = sorted(samples, key=lambda x: x['idx'])
    else:
        samples = [dict(idx=idx, **sample) for idx, sample in enumerate(samples)]

    params = [(idx, sample['pred'], sample['gt']) for idx, sample in enumerate(samples)]

    scores = []
    timeout_cnt = 0

    with ProcessPool() as pool:
        future = pool.map(math_equal_process, params, timeout=3)
        iterator = future.result()
        with tqdm(total=len(samples), desc="Evaluate") as progress_bar:
            while True:
                try:
                    result = next(iterator)
                    scores.append(result)
                except StopIteration:
                    break
                except TimeoutError as error:
                    print(error)
                    scores.append(False)
                    timeout_cnt += 1
                except Exception as error:
                    print(error.traceback)
                    exit()
                progress_bar.update(1)

    assert len(samples) == len(scores)

    for i in range(len(samples)):
        samples[i]['score'] = scores[i]

    # accuracy = fraction of correct samples (timeouts count as incorrect)
    mean_score = np.round(np.mean(scores), decimals=2)

    result_json = {
        "num_samples": len(samples),
        "num_scores": len(scores),
        "timeout_samples": timeout_cnt,
        "acc": mean_score
    }

    return samples, result_json


if __name__ == "__main__":
    samples, results_json = evaluate(file_path="output/MATH.jsonl")
    print(results_json)
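A sketch of grading in-memory samples: `evaluate` needs only `pred` and `gt` per sample, fans grading out through a pebble `ProcessPool` with a 3-second per-sample timeout, and the `acc` shown assumes the grader judges `0.5` equivalent to `\frac{1}{2}`.

```python
# Sketch: grading two hand-made samples without going through a JSONL file.
from evaluate import evaluate

samples = [
    {"pred": "0.5", "gt": "\\frac{1}{2}"},  # graded equivalent
    {"pred": "42", "gt": "41"},             # graded wrong
]

if __name__ == "__main__":  # pebble spawns worker processes
    scored, metrics = evaluate(samples=samples)
    print(metrics)  # e.g. {'num_samples': 2, 'num_scores': 2, 'timeout_samples': 0, 'acc': 0.5}
```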
grader.py
ADDED
@@ -0,0 +1,305 @@
"""
Source and credits: https://github.com/ZubinGou/math-evaluation-harness/blob/main/python_executor.py
This logic is largely copied from the Hendrycks' MATH release (math_equivalence), and borrowed from:
- https://github.com/microsoft/ProphetNet/tree/master/CRITIC
- https://github.com/openai/prm800k
- https://github.com/microsoft/ToRA/blob/main/src/eval/grader.py
- https://github.com/deepseek-ai/DeepSeek-Math/blob/main/evaluation/eval/eval_utils.py
"""
import re
import regex
import multiprocessing
from math import isclose
from typing import Union

from sympy import simplify, N
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.latex import parse_latex
from latex2sympy2 import latex2sympy


def parse_digits(num):
    num = regex.sub(',', '', str(num))
    try:
        return float(num)
    except:
        if num.endswith('%'):
            num = num[:-1]
            if num.endswith('\\'):
                num = num[:-1]
            try:
                return float(num) / 100
            except:
                pass
    return None


def is_digit(num):
    # paired with parse_digits
    return parse_digits(num) is not None


def str_to_pmatrix(input_str):
    input_str = input_str.strip()
    matrix_str = re.findall(r'\{.*,.*\}', input_str)
    pmatrix_list = []

    for m in matrix_str:
        m = m.strip('{}')
        pmatrix = r'\begin{pmatrix}' + m.replace(',', '\\') + r'\end{pmatrix}'
        pmatrix_list.append(pmatrix)

    return ', '.join(pmatrix_list)


def math_equal(prediction: Union[bool, float, str],
               reference: Union[float, str],
               include_percentage: bool = True,
               is_close: bool = True,
               timeout: bool = False,
               ) -> bool:
    """
    Exact match of math if and only if:
    1. numerical equal: both can convert to float and are equal
    2. symbolic equal: both can convert to sympy expression and are equal
    """
    # print("Judge:", prediction, reference)
    if str(prediction) == str(reference):
        return True

    try:  # 1. numerical equal
        if is_digit(prediction) and is_digit(reference):
            prediction = parse_digits(prediction)
            reference = parse_digits(reference)
            # number questions
            if include_percentage:
                gt_result = [reference / 100, reference, reference * 100]
            else:
                gt_result = [reference]
            for item in gt_result:
                try:
                    if is_close:
                        if numeric_equal(prediction, item):
                            return True
                    else:
                        if item == prediction:
                            return True
                except Exception:
                    continue
            return False
    except:
        pass

    if not prediction and prediction not in [0, False]:
        return False
    # print("try math_eval")

    # 2. symbolic equal
    reference = str(reference).strip()
    prediction = str(prediction).strip()

    ## pmatrix (amps)
    if "pmatrix" in prediction and 'pmatrix' not in reference:
        reference = str_to_pmatrix(reference)

    ## deal with [], (), {}
    pred_str, ref_str = prediction, reference
    if (prediction.startswith("[") and prediction.endswith("]") and not reference.startswith("(")) or \
            (prediction.startswith("(") and prediction.endswith(")") and not reference.startswith("[")):
        pred_str = pred_str.strip("[]()")
        ref_str = ref_str.strip("[]()")
    for s in ['{', "}", "(", ")"]:
        ref_str = ref_str.replace(s, "")
        pred_str = pred_str.replace(s, "")
    if pred_str.lower() == ref_str.lower():
        return True

    ## [a, b] vs. [c, d]: equal iff a==c and b==d
    if regex.match(r'(\(|\[).+(\)|\])', prediction) is not None and regex.match(r'(\(|\[).+(\)|\])', reference) is not None:
        pred_parts = prediction[1:-1].split(",")
        ref_parts = reference[1:-1].split(",")
        if len(pred_parts) == len(ref_parts):
            if all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):
                return True
    if (prediction.startswith("\\begin{pmatrix}") or prediction.startswith("\\begin{bmatrix}")) and (prediction.endswith("\\end{pmatrix}") or prediction.endswith("\\end{bmatrix}")) and \
            (reference.startswith("\\begin{pmatrix}") or reference.startswith("\\begin{bmatrix}")) and (reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")):
        pred_lines = [line.strip() for line in prediction[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]
        ref_lines = [line.strip() for line in reference[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]
        matched = True
        if len(pred_lines) == len(ref_lines):
            for pred_line, ref_line in zip(pred_lines, ref_lines):
                pred_parts = pred_line.split("&")
                ref_parts = ref_line.split("&")
                if len(pred_parts) == len(ref_parts):
                    if not all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):
                        matched = False
                        break
                else:
                    matched = False
                if not matched:
                    break
        else:
            matched = False
        if matched:
            return True

    if prediction.count('=') == 1 and reference.count('=') == 1:
        pred = prediction.split('=')
        pred = f"{pred[0].strip()} - ({pred[1].strip()})"
        ref = reference.split('=')
        ref = f"{ref[0].strip()} - ({ref[1].strip()})"
        if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):
            return True
    elif prediction.count('=') == 1 and len(prediction.split('=')[0].strip()) <= 2 and '=' not in reference:
        if math_equal(prediction.split('=')[1], reference, include_percentage, is_close):
            return True
    elif reference.count('=') == 1 and len(reference.split('=')[0].strip()) <= 2 and '=' not in prediction:
        if math_equal(prediction, reference.split('=')[1], include_percentage, is_close):
            return True

    # print("try final")
    # symbolic equal with sympy
    if timeout:
        if call_with_timeout(symbolic_equal_process, prediction, reference):
            return True
    else:
        if symbolic_equal(prediction, reference):
            return True

    return False


def math_equal_process(param):
    return math_equal(param[-2], param[-1])


def numeric_equal(prediction: float, reference: float):
    # Note that relative tolerance has significant impact
    # on the result of the synthesized gsm_hard dataset
    # if reference.is_integer():
    #     return isclose(reference, round(prediction), abs_tol=1e-4)
    # else:
    #     prediction = round(prediction, len(str(reference).split(".")[-1]))
    return isclose(reference, prediction, rel_tol=1e-4)


def symbolic_equal(a, b):
    def _parse(s):
        for f in [parse_latex, parse_expr, latex2sympy]:
            try:
                return f(s.replace("\\\\", "\\"))
            except:
                try:
                    return f(s)
                except:
                    pass
        return s
    a = _parse(a)
    b = _parse(b)

    # direct equal
    try:
        if str(a) == str(b) or a == b:
            return True
    except:
        pass

    # print("try simplify")
    # simplify equal
    try:
        if a.equals(b) or simplify(a - b) == 0:
            return True
    except:
        pass

    # print("try equation")
    # equation equal
    try:
        if (abs(a.lhs - a.rhs)).equals(abs(b.lhs - b.rhs)):
            return True
    except:
        pass

    try:
        if numeric_equal(float(N(a)), float(N(b))):
            return True
    except:
        pass

    # matrix
    try:
        # if a and b are matrices
        if a.shape == b.shape:
            _a = a.applyfunc(lambda x: round(x, 3))
            _b = b.applyfunc(lambda x: round(x, 3))
            if _a.equals(_b):
                return True
    except:
        pass

    return False


def symbolic_equal_process(a, b, output_queue):
    result = symbolic_equal(a, b)
    output_queue.put(result)


def call_with_timeout(func, *args, timeout=1, **kwargs):
    output_queue = multiprocessing.Queue()
    process_args = args + (output_queue,)
    process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
    process.start()
    process.join(timeout)

    if process.is_alive():
        process.terminate()
        process.join()
        return False

    return output_queue.get()


def _test_math_equal():
    # print(math_equal("0.0833333333333333", "\\frac{1}{12}"))
    # print(math_equal("(1,4.5)", "(1,\\frac{9}{2})"))
    # print(math_equal("\\frac{x}{7}+\\frac{2}{7}", "\\frac{x+2}{7}", timeout=True))
    # print(math_equal("\\sec^2(y)", "\\tan^2(y)+1", timeout=True))
    # print(math_equal("\\begin{pmatrix}-\\frac{7}{4}&-2\\\\4&\\frac{1}{4}\\end{pmatrix}", "(\\begin{pmatrix}-\\frac{7}{4}&-2\\\\4&\\frac{1}{4}\\\\\\end{pmatrix})", timeout=True))

    # pred = '\\begin{pmatrix}\\frac{1}{3x^{2/3}}&0&0\\\\0&1&0\\\\-\\sin(x)&0&0\\end{pmatrix}'
    # gt = '(\\begin{pmatrix}\\frac{1}{3\\sqrt[3]{x}^2}&0&0\\\\0&1&0\\\\-\\sin(x)&0&0\\\\\\end{pmatrix})'

    # pred = '-\\frac{8x^2}{9(x^2-2)^{5/3}}+\\frac{2}{3(x^2-2)^{2/3}}'
    # gt = '-\\frac{2(x^2+6)}{9(x^2-2)\\sqrt[3]{x^2-2}^2}'

    # pred = '-34x-45y+20z-100=0'
    # gt = '34x+45y-20z+100=0'

    # pred = '\\frac{100}{3}'
    # gt = '33.3'

    # pred = '\\begin{pmatrix}0.290243531202435\\\\0.196008371385084\\\\-0.186381278538813\\end{pmatrix}'
    # gt = '(\\begin{pmatrix}0.29\\\\0.196\\\\-0.186\\\\\\end{pmatrix})'

    # pred = '\\frac{\\sqrt{\\sqrt{11}+\\sqrt{194}}}{2\\sqrt{33}+15}'
    # gt = '\\frac{\\sqrt{\\sqrt{11}+\\sqrt{194}}}{15+2\\sqrt{33}}'

    # pred = '(+5)(b+2)'
    # gt = '(a+5)(b+2)'

    # pred = '\\frac{1+\\sqrt{5}}{2}'
    # gt = '2'

    # pred = '\\frac{34}{16}+\\frac{\\sqrt{1358}}{16}', gt = '4'
    # pred = '1', gt = '1\\\\sqrt{19}'

    pred = '(0.6,2.6667]'
    gt = '(\\frac{3}{5},\\frac{8}{3}]'

    print(math_equal(pred, gt, timeout=True))


if __name__ == "__main__":
    _test_math_equal()
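The first few commented-out cases in `_test_math_equal` double as usage examples; run directly, they exercise the numeric, tuple-wise, and symbolic comparison paths:

```python
from grader import math_equal

if __name__ == "__main__":  # math_equal(timeout=True) spawns a helper process
    print(math_equal("0.0833333333333333", "\\frac{1}{12}"))  # True (numeric fallback, rel_tol=1e-4)
    print(math_equal("(1,4.5)", "(1,\\frac{9}{2})"))          # True (element-wise tuple check)
    print(math_equal("\\frac{x}{7}+\\frac{2}{7}", "\\frac{x+2}{7}", timeout=True))  # True (sympy simplify)
```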
main.py
ADDED
@@ -0,0 +1,468 @@
1 |
+
import argparse
|
2 |
+
import datetime
|
3 |
+
import json
|
4 |
+
import logging
|
5 |
+
import multiprocessing
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
from abc import ABC, abstractmethod
|
9 |
+
|
10 |
+
import hjson
|
11 |
+
import numpy as np
|
12 |
+
import openai
|
13 |
+
from tqdm import tqdm
|
14 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
15 |
+
|
16 |
+
from data_loader import load_data
|
17 |
+
from code_executor import PythonExecutor
|
18 |
+
from utils import (Agent, LLMClient, PromptTemplate, api_configs,
|
19 |
+
extract_and_parse_markup, setup_logging)
|
20 |
+
from data_utils import parse_question, parse_ground_truth
|
21 |
+
from evaluate import evaluate
|
22 |
+
|
23 |
+
|
24 |
+
logger = setup_logging()
|
25 |
+
|
26 |
+
class RetrievalAugmentation:
|
27 |
+
# TODO: implement the retrieval augmentation later
|
28 |
+
def __init__(self, dataset, embeddings):
|
29 |
+
self.dataset = dataset
|
30 |
+
self.embeddings = embeddings
|
31 |
+
|
32 |
+
def get_similar_examples(self, query_embedding, n=3):
|
33 |
+
similarities = cosine_similarity([query_embedding], self.embeddings)[0]
|
34 |
+
top_indices = similarities.argsort()[-n:][::-1]
|
35 |
+
return [self.dataset[i] for i in top_indices]
|
36 |
+
|
37 |
+
class SwiftAgent(Agent):
|
38 |
+
def __init__(self, prompt_template, llm_client, retrieval_augmentation=None):
|
39 |
+
super().__init__(prompt_template, llm_client)
|
40 |
+
self.retrieval_augmentation = retrieval_augmentation
|
41 |
+
self.plans = {}
|
42 |
+
self.codes = {}
|
43 |
+
|
44 |
+
def generate_response(self, prompt, reasoning, current_solution, plan, critical_feedback, prefill=True):
|
45 |
+
logger.info("SwiftAgent generating response")
|
46 |
+
if self.retrieval_augmentation:
|
47 |
+
query_embedding = self.get_query_embedding(prompt)
|
48 |
+
similar_examples = self.retrieval_augmentation.get_similar_examples(query_embedding)
|
49 |
+
examples_text = "\n".join(similar_examples) # TODO: add more context to the prompt
|
50 |
+
else:
|
51 |
+
examples_text = "No similar examples available."
|
52 |
+
|
53 |
+
swift_prompt = self.prompt_template.format(
|
54 |
+
"swift",
|
55 |
+
prompt=prompt,
|
56 |
+
current_reasoning=reasoning, # TODO: check if this is needed
|
57 |
+
examples=examples_text,
|
58 |
+
current_solution=current_solution,
|
59 |
+
critical_feedback=critical_feedback,
|
60 |
+
revised_plan=plan
|
61 |
+
)
|
62 |
+
# logger.info(f"SwiftAgent prompt:\n{swift_prompt}")
|
63 |
+
|
64 |
+
messages = [
|
65 |
+
{"role": "system", "content": ''},
|
66 |
+
{"role": "user", "content": swift_prompt}
|
67 |
+
]
|
68 |
+
if prefill:
|
69 |
+
messages.append({"role": "assistant", "content": "<plan>"}) # prefix-filling
|
70 |
+
|
71 |
+
response = self.llm_client.generate_response(messages)
|
72 |
+
if prefill:
|
73 |
+
response = "<plan>" + response
|
74 |
+
|
75 |
+
try:
|
76 |
+
parsed_response = extract_and_parse_markup(response)
|
77 |
+
return parsed_response
|
78 |
+
except json.JSONDecodeError:
|
79 |
+
logger.error("Error: Swift's response was not in valid JSON format. Returning raw response.")
|
80 |
+
return response
|
81 |
+
|
82 |
+
def get_query_embedding(self, query):
|
83 |
+
# Implement query embedding generation
|
84 |
+
return np.random.rand(768) # Placeholder, replace with actual embedding
|
85 |
+
|
86 |
+
class SageAgent(Agent):
|
87 |
+
def __init__(self, prompt_template, llm_client):
|
88 |
+
super().__init__(prompt_template, llm_client)
|
89 |
+
self.feedbacks = {}
|
90 |
+
self.plans = {}
|
91 |
+
|
92 |
+
|
93 |
+
def generate_response(self, prompt, reasoning, current_solution, prefill=True):
|
94 |
+
logger.info("SageAgent generating response")
|
95 |
+
sage_prompt = self.prompt_template.format(
|
96 |
+
"sage",
|
97 |
+
prompt=prompt,
|
98 |
+
reasoning=reasoning,
|
99 |
+
current_solution=current_solution
|
100 |
+
)
|
101 |
+
# logger.info(f"SageAgent prompt:\n{sage_prompt}")
|
102 |
+
|
103 |
+
messages = [
|
104 |
+
{"role": "system", "content": ""},
|
105 |
+
{"role": "user", "content": sage_prompt}
|
106 |
+
]
|
107 |
+
if prefill:
|
108 |
+
messages.append({"role": "assistant", "content": "<solved>"}) # prefix-filling
|
109 |
+
|
110 |
+
response = self.llm_client.generate_response(messages)
|
111 |
+
# logger.info(f"SageAgent raw response:\n{response}")
|
112 |
+
if prefill:
|
113 |
+
response = "<solved>" + response
|
114 |
+
try:
|
115 |
+
parsed_response = extract_and_parse_markup(response)
|
116 |
+
return parsed_response
|
117 |
+
except json.JSONDecodeError:
|
118 |
+
logger.error("Error: Sage's response was not in valid JSON format. Returning raw response.")
|
119 |
+
return response
|
120 |
+
|
121 |
+
class RewardModel:
|
122 |
+
def __init__(self, prompt_template, llm_client):
|
123 |
+
self.prompt_template = prompt_template
|
124 |
+
self.llm_client = llm_client
|
125 |
+
self.scores = []
|
126 |
+
self.feedbacks = []
|
127 |
+
self.stagnant_count = 0
|
128 |
+
|
129 |
+
def calculate_reward(self, problem, reasoning, current_solution, prefill=True):
|
130 |
+
reward_prompt = self.prompt_template.format(
|
131 |
+
"reward",
|
132 |
+
problem=problem,
|
133 |
+
reasoning= reasoning,
|
134 |
+
current_solution=current_solution
|
135 |
+
)
|
136 |
+
# logger.info(f"RewardModel prompt:\n{reward_prompt}")
|
137 |
+
|
138 |
+
messages = [
|
139 |
+
{"role": "system", "content": ""},
|
140 |
+
{"role": "user", "content": reward_prompt}
|
141 |
+
]
|
142 |
+
if prefill:
|
143 |
+
messages.append({"role": "assistant", "content": "<feedback>"}) # prefix-filling
|
144 |
+
|
145 |
+
reward_response = self.llm_client.generate_response(messages)
|
146 |
+
if prefill:
|
147 |
+
reward_response = "<feedback>" + reward_response
|
148 |
+
|
149 |
+
try:
|
150 |
+
parsed_response = extract_and_parse_markup(reward_response)
|
151 |
+
score = int(parsed_response["score"])
|
152 |
+
|
153 |
+
# Update stagnant_count based on score comparison
|
154 |
+
if len(self.scores) > 0 and score <= self.scores[-1]:
|
155 |
+
self.stagnant_count += 1
|
156 |
+
else:
|
157 |
+
self.stagnant_count = 0
|
158 |
+
|
159 |
+
return parsed_response
|
160 |
+
except json.JSONDecodeError:
|
161 |
+
logger.error("Error: Reward model's response was not in valid JSON format. Returning raw response.")
|
162 |
+
return reward_response
|
163 |
+
|
164 |
+
def should_consult_sage(self):
|
165 |
+
# This method remains unchanged
|
166 |
+
return self.stagnant_count >= 1 or (len(self.scores) > 0 and self.scores[-1] < 5)
|
167 |
+
|
168 |
+
class SwiftSage:
|
169 |
+
def __init__(self, dataset, embeddings, prompt_template_dir, swift_config, sage_config, reward_config, use_retrieval=True, start_with_sage=False):
|
170 |
+
prompt_template = PromptTemplate(prompt_template_dir)
|
171 |
+
retrieval_augmentation = RetrievalAugmentation(dataset, embeddings) if use_retrieval else None
|
172 |
+
|
173 |
+
# add logger to the following LLMClient
|
174 |
+
swift_llm = LLMClient(**swift_config, logger=logger)
|
175 |
+
sage_llm = LLMClient(**sage_config, logger=logger)
|
176 |
+
reward_llm = LLMClient(**reward_config, logger=logger)
|
177 |
+
|
178 |
+
self.swift = SwiftAgent(prompt_template, swift_llm, retrieval_augmentation)
|
179 |
+
self.sage = SageAgent(prompt_template, sage_llm)
|
180 |
+
self.reward_model = RewardModel(prompt_template, reward_llm)
|
181 |
+
self.start_with_sage = start_with_sage
|
182 |
+
# self.executor = PythonExecutor(get_answer_from_stdout=True)
|
183 |
+
|
184 |
+
def solve(self, problem, max_iterations=10, reward_threshold=8):
|
185 |
+
logger.info(f"Starting to solve problem: {problem}")
|
186 |
+
current_solution = "No current solution yet." # final answer
|
187 |
+
current_reasoning = "No reasoning steps yet." # reasoning steps
|
188 |
+
plan = "Initial plan: Take a deep breath and think step by step."
|
189 |
+
critical_feedback = "No critical feedback yet." # Initialize critical_feedback
|
190 |
+
solved = False
|
191 |
+
for i in range(max_iterations):
|
192 |
+
logger.info(f"Iteration {i+1}")
|
193 |
+
|
194 |
+
|
195 |
+
            # Use the Sage Agent
            if (i == 0 and self.start_with_sage) or self.reward_model.should_consult_sage():
                sage_parsed = self.sage.generate_response(problem, current_reasoning, current_solution)
                critical_feedback = sage_parsed["critical_feedback"]
                # plan = "\n - " + "\n - ".join(sage_parsed["revised_plan"])
                current_reasoning = sage_parsed["reasoning_steps"]
                current_code = sage_parsed["code"]

                # parse the "True"/"False" string into a bool; the raw string is truthy even for "False"
                solved = sage_parsed["solved"].lower() == "true"
                if solved:
                    return current_reasoning, current_solution
                logger.info(f"Sage's feedback (iteration {i+1}):\n{critical_feedback}")
                # logger.info(f"Sage's reasoning steps:\n{current_reasoning}")
                self.sage.feedbacks[i] = critical_feedback

                # run the code
                executor = PythonExecutor(get_answer_from_stdout=True)
                code_result, code_report = executor.apply(current_code)
                logger.info(f"Sage Code execution report: {code_report}")
                logger.info(f"Sage Code execution result: {code_result}")
                current_reasoning = current_reasoning + f"\n\nThe generated code is:\n\n```python\n{current_code}\n```"
                current_solution = "Answer (from running the code):\n " + code_result

                # current_solution = sage_parsed["final_answer"]
                logger.info("Activated Sage, so we should return the reasoning and solution from Sage.")
                return current_reasoning, current_solution

            if not solved:
                # Use the Swift Agent
                swift_parsed = self.swift.generate_response(problem, current_reasoning, current_solution, plan, critical_feedback)

                if "code" not in swift_parsed and "final_answer" not in swift_parsed:
                    logger.info("Swift's response does not contain the 'final_answer' or 'code' field. Returning raw response.")
                    self.reward_model.scores.append(0)
                    self.reward_model.feedbacks.append("No feedback")
                    self.reward_model.stagnant_count += max_iterations  # force to use the Sage Agent
                    continue

                current_plan = swift_parsed["plan"]
                current_code = swift_parsed["code"]
                current_answer = swift_parsed.get("final_answer", None)

                self.swift.plans[i] = current_plan
                self.swift.codes[i] = current_code

                logger.info(f"Swift's plan:\n{current_plan}")
                logger.info(f"Swift's code:\n{current_code}")

                # Call the sandbox to run the code and get the result
                executor = PythonExecutor(get_answer_from_stdout=True)
                code_result, code_report = executor.apply(current_code)
                logger.info(f"Code execution report: {code_report}")
                logger.info(f"Code execution result: {code_result}")

                current_reasoning = current_plan + f"\nThe generated code is:\n```python\n{current_code}\n```"
                current_solution = "Answer (from running the code):\n " + code_result

            # Call the reward model to provide feedback and a score
            reward_parsed = self.reward_model.calculate_reward(problem, current_reasoning, current_solution)
            score = int(reward_parsed["score"])
            feedback = reward_parsed["feedback"]
            prev_score = self.reward_model.scores[-1] if len(self.reward_model.scores) > 0 else 0
            self.reward_model.scores.append(score)
            self.reward_model.feedbacks.append(feedback)

            # detect if the score is lower than the previous score
            logger.info(f"Reward for iteration {i+1}: {score}/10")
            logger.info(f"Feedback: {feedback}")

            if False and score < prev_score:  # disabled for now
                logger.info("Score is lower than the previous score. Stopping the iteration. Reverting to the previous solution and reasoning.")
                # revert to the previous solution and reasoning
                current_solution = self.swift.codes[i-1]
                current_reasoning = self.swift.plans[i-1]
                continue

            critical_feedback = feedback

            if score >= reward_threshold or solved:
                logger.info("Perfect solution found!")
                return current_reasoning, current_solution

            if self.reward_model.should_consult_sage():
                logger.info("Reward model: The solution quality hasn't improved recently. Consulting Sage for the next iteration.")

        logger.info("Max iterations reached without finding a perfect solution.")
        logger.info("Problem solving completed")
        return current_reasoning, current_solution

def run_test(swiftsage, problem, max_iterations=5, reward_threshold=8):
    logger.info(f"Testing problem: {problem}")
    reasoning, solution = swiftsage.solve(problem, max_iterations, reward_threshold)
    logger.info(f"Final reasoning:\n{reasoning}")
    logger.info(f"Final solution:\n{solution}")
    logger.info("=" * 50)

def run_benchmark(swiftsage, args, max_iterations=5, reward_threshold=8):
    examples = load_data(args.dataset_name, args.split, args.data_dir, args.num_test_sample)

    res = []
    skip_ids = []

    output_path = os.path.join(args.output_path, f"{args.dataset_name}.jsonl")
    if os.path.exists(output_path):
        # resume from a previous run: reload finished examples and skip them below
        with open(output_path) as fr:
            model_responses = fr.readlines()

        for item in model_responses:
            item = json.loads(item)
            res.append(item)
            skip_ids.append(item["idx"])

    for example in tqdm(examples, desc=args.dataset_name):
        if example["idx"] in skip_ids:
            continue
        question = parse_question(example, args.dataset_name)
        gt_ans = parse_ground_truth(example, args.dataset_name)
        reasoning, solution = swiftsage.solve(question, max_iterations, reward_threshold)

        # TODO: extract answer from solution

        cur_res = {
            "idx": example["idx"],
            "question": question,
            "gt": gt_ans,
            "pred": solution,
            "reasoning": reasoning,
        }
        res.append(cur_res)

        with open(output_path, "a") as fw:
            fw.write(json.dumps(res[-1]) + "\n")

    # Evaluate the results
    res, result_metric = evaluate(res)
    with open(os.path.join(args.output_path, f"{args.dataset_name}_score.jsonl"), "w") as fw:
        for item in res:
            fw.write(json.dumps(item) + "\n")
    with open(os.path.join(args.output_path, f"{args.dataset_name}_metric.jsonl"), "w") as fw:
        fw.write(json.dumps(result_metric) + "\n")

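For reference, each line of the resulting `{dataset_name}.jsonl` file is one JSON record with the fields written above (`idx`, `question`, `gt`, `pred`, `reasoning`). A minimal sketch of loading the records back for inspection, assuming the default `--output_path ./output` and `--dataset_name MATH`:

```python
import json

# Read the per-example records that run_benchmark appends, one JSON object per line.
with open("./output/MATH.jsonl") as f:
    records = [json.loads(line) for line in f]

for rec in records[:3]:
    print(rec["idx"], rec["gt"], rec["pred"][:80])
```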
def main(args):

    # TODO: for retrieval augmentation (not implemented yet)
    # dataset = ["Example problem 1: ...", "Example problem 2: ...", "Example problem 3: ..."]
    # embeddings = np.random.rand(len(dataset), 768)  # Placeholder, replace with actual embeddings


    # Configuration for each LLM
    # swift_config = {
    #     "model_id": "Meta-Llama-3.1-8B-Instruct",
    #     "api_config": api_configs['SambaNova']
    # }

    # reward_config = {
    #     "model_id": "Meta-Llama-3.1-70B-Instruct",
    #     "api_config": api_configs['SambaNova']
    # }

    # sage_config = {
    #     "model_id": "Meta-Llama-3.1-405B-Instruct",
    #     "api_config": api_configs['SambaNova']
    # }

    swift_config = {
        "model_id": args.swift_model_id,
        "api_config": api_configs[args.api_provider]
    }

    reward_config = {
        "model_id": args.reward_model_id,
        "api_config": api_configs[args.api_provider]
    }

    sage_config = {
        "model_id": args.sage_model_id,
        "api_config": api_configs[args.api_provider]
    }

    # specify the path to the prompt templates
    prompt_template_dir = args.prompt_template_dir
    dataset = []
    embeddings = []  # TODO: for retrieval augmentation (not implemented yet)
    s2 = SwiftSage(
        dataset,
        embeddings,
        prompt_template_dir,
        swift_config,
        sage_config,
        reward_config,
        use_retrieval=args.use_retrieval,
        start_with_sage=args.start_with_sage,
    )

    if args.eval_mode == "test":
        test_problems = [
            "Solve the equation: 2x + 5 = 13",  # 0
            "If h(x)=x-4 and g(h(x))=x^2-8x+10, find g(x)? show the formula for g(x)",  # 1
            "Solve the equation: 6y + 5 = 29",  # 2
            "Who lived longer, Lowell Sherman or Jonathan Kaplan?",  # 3
            "9.9 or 9.11 -- which is bigger?",  # 4
            "How can you solve the quadratic equation 3x^2 + 7x + 4 = 0 using the quadratic formula?",  # 5
            "Explain why sound waves cannot travel in a vacuum.",  # 6
            "How many grams of hydrogen (H) are present in 23.5 grams of water (H2O)?",  # 7
            "What is the distance between the points (2, 3) and (5, 8)?",  # 8
            "Why can the Hubble telescope capture clear images of distant stars and galaxies, but not a detailed image of Pluto?",  # 9
            r"""A rectangular band formation is a formation with $m$ band members in each of $r$ rows, where $m$ and $r$ are integers. A particular band has less than 100 band members. The director arranges them in a rectangular formation and finds that he has two members left over. If he increases the number of members in each row by 1 and reduces the number of rows by 2, there are exactly enough places in the new formation for each band member. What is the largest number of members the band could have?""",
            r"""Tim wants to invest some money in a bank which compounds quarterly with an annual interest rate of $7\%$. To the nearest dollar, how much money should he invest if he wants a total of $\$60,\!000$ at the end of $5$ years?""",
            """In an SR latch built from NOR gates, which condition is not allowed

Options:
[ "S=0, R=2", "S=2, R=2", "S=1, R=1", "S=1, R=-1", "S=1, R=2", "S=0, R=0", "S=2, R=0", "S=1, R=0", "S=2, R=1", "S=0, R=1" ]

Which one is the correct answer?""",
            # ... add other problems here ...
            """How many letter r are there in the word "strawberry"?""",
        ]

        # for problem in test_problems:
        pid = 7
        print(f"Problem {pid}: {test_problems[pid]}")
        run_test(s2, test_problems[pid], args.max_iterations, args.reward_threshold)
    elif args.eval_mode == "benchmark":
        run_benchmark(s2, args, args.max_iterations, args.reward_threshold)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_mode", default="test", choices=["test", "benchmark"], type=str)

    parser.add_argument("--dataset_name", default="MATH", type=str)
    parser.add_argument("--data_dir", default="./data", type=str)
    parser.add_argument("--split", default="test", type=str)
    parser.add_argument("--num_test_sample", default=-1, type=int)  # -1 for full data

    parser.add_argument("--api_provider", default="Together", choices=["Together", "SambaNova"], type=str)
    parser.add_argument("--swift_model_id", default="meta-llama/Meta-Llama-3-8B-Instruct-Turbo", type=str)
    parser.add_argument("--reward_model_id", default="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", type=str)
    parser.add_argument("--sage_model_id", default="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", type=str)

    parser.add_argument("--prompt_template_dir", default='./prompt_templates', type=str)
    parser.add_argument("--use_retrieval", action="store_true")
    parser.add_argument("--start_with_sage", action="store_true")

    parser.add_argument("--max_iterations", default=5, type=int)
    parser.add_argument("--reward_threshold", default=8, type=int)

    parser.add_argument("--save_outputs", action="store_true")
    parser.add_argument("--output_path", default="./output", type=str)
    parser.add_argument("--overwrite", action="store_true")

    args = parser.parse_args()

    # remove console output for benchmark evaluation
    if args.eval_mode != "test":
        root_logger = logging.getLogger("")
        for handler in root_logger.handlers:
            if isinstance(handler, logging.StreamHandler):
                root_logger.removeHandler(handler)
                break

    if args.api_provider == "SambaNova":
        # SambaNova serves the base model names (e.g. "Meta-Llama-3.1-8B-Instruct"),
        # so drop the "meta-llama/" prefix and the "-Turbo" suffix.
        args.swift_model_id = args.swift_model_id.split("/")[-1][:-len("-Turbo")]
        args.reward_model_id = args.reward_model_id.split("/")[-1][:-len("-Turbo")]
        args.sage_model_id = args.sage_model_id.split("/")[-1][:-len("-Turbo")]

    multiprocessing.set_start_method('spawn')
    main(args)
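With the defaults above, `python main.py` runs the built-in test problems against the Together-hosted models, and `python main.py --eval_mode benchmark --dataset_name MATH` runs the benchmark loop; see `run_eval.sh` below for a full example invocation.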
prompt_templates/reward_template.md
ADDED
@@ -0,0 +1,44 @@
# Instruction

You are a reward model. You will be given a problem and a solution, and you will evaluate the solution based on the criteria provided.

## Problem
<problem>

## Current Solution

### Reasoning Steps
<reasoning>

### Final Answer
<current_solution>


## Your Evaluation

We are not sure if the current solution is correct. Please evaluate the current solution based on the following criteria:

1. Correctness
2. Completeness

Provide a score from 1 to 10 and a brief explanation.
If you are not sure about the final answer, provide a score between 1 and 7 and explain why you are not sure.
Take care not to give false information in your critical feedback.


## Output Format

Remember to present your output in the following format:

<feedback>
Your critical feedback here.
</feedback>


<score>
Your score here.
</score>

# Important Notes

You must follow the format strictly; do not miss any field. Start your output with "<feedback>" and end it with "</score>".
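The `<problem>`, `<reasoning>`, and `<current_solution>` markers above are placeholders: `PromptTemplate.format` in `utils.py` replaces each `<key>` with the corresponding keyword argument, and `extract_and_parse_markup` pulls the `<feedback>`/`<score>` fields back out of the model's reply. A minimal sketch (the reply string here is a made-up example):

```python
from utils import PromptTemplate, extract_and_parse_markup

templates = PromptTemplate('./prompt_templates')
prompt = templates.format(
    'reward',
    problem="Solve the equation: 2x + 5 = 13",
    reasoning="Subtract 5 from both sides, then divide by 2.",
    current_solution="x = 4",
)

# A hypothetical, well-formed reply from the reward model:
reply = "<feedback>\nThe steps and the final answer are correct.\n</feedback>\n<score>\n9\n</score>"
parsed = extract_and_parse_markup(reply)
print(int(parsed["score"]), parsed["feedback"])
```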
prompt_templates/sage_template.md
ADDED
@@ -0,0 +1,45 @@
# Instruction

You are a high-level problem-solving agent. You will be given a problem and a current solution. You will then provide critical feedback on the current solution and suggest a revised plan if needed.
If the current solution is correct and complete, you will indicate that the problem is solved and no further action is needed.

## Problem
<prompt>

## Current Solution

### Reasoning Steps
<reasoning>

### Final Answer
<current_solution>


## Critical Feedback

We are not sure if the current solution is correct. Please provide critical feedback on the current solution and suggest a revised plan for the next steps, considering any challenges or improvements needed.

If the solution and answer are correct, please set `solved` to `"True"`, and leave `critical_feedback` and `reasoning_steps` empty.
Please point out any errors in the current solution in the `critical_feedback` field, then provide the revised plan in the `reasoning_steps` field, and finally provide the final answer in the `final_answer` field.


Format your response in the following format:


<solved>
[True or False]
</solved>

<critical_feedback>
[Your critical feedback here.]
</critical_feedback>

<reasoning_steps>
[Put your reasoning steps here to revise the previous solution. Use additional knowledge if needed; the code to solve the problem goes in the next field.]
</reasoning_steps>

<code>
[Put your updated code here to solve the problem.]
</code>

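A note on consuming this template's output: `extract_and_parse_markup` in `utils.py` recognizes the `solved`, `critical_feedback`, `reasoning_steps`, and `code` tags, and strips any Markdown code fences inside `<code>`. The `<solved>` value comes back as the string `"True"` or `"False"`, so it has to be compared against `"true"` (as `solve` in `main.py` does) rather than used directly as a truth value. A small illustration with a made-up reply:

```python
from utils import extract_and_parse_markup

# Hypothetical sage reply, following the format above.
reply = """<solved>
False
</solved>
<critical_feedback>
The previous code used the wrong formula.
</critical_feedback>
<reasoning_steps>
Recompute using the correct formula.
</reasoning_steps>
<code>
print(2 * 21)
</code>"""

parsed = extract_and_parse_markup(reply)
solved = parsed["solved"].lower() == "true"  # the raw string "False" would be truthy
print(solved, parsed["code"])
```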
prompt_templates/swift_template.md
ADDED
@@ -0,0 +1,152 @@
# Instruction

## Similar Examples with Solutions

### Example Task 1

<task>
Convert the point $(0, -3 \sqrt{3}, 3)$ in rectangular coordinates to spherical coordinates. Enter your answer in the form $(\rho,\theta,\phi),$ where $\rho > 0,$ $0 \le \theta < 2 \pi,$ and $0 \le \phi \le \pi.$
</task>

<plan>
Step 1. Recall the formulas for converting from rectangular coordinates $(x, y, z)$ to spherical coordinates $(\rho, \theta, \phi)$:
- $\rho = \sqrt{x^2 + y^2 + z^2}$
- $\theta = \operatorname{atan2}(y, x)$, shifted into $[0, 2\pi)$ if negative
- $\phi = \arccos\left(\frac{z}{\rho}\right)$

Step 2. Given point: $(0, -3\sqrt{3}, 3)$
$x = 0$
$y = -3\sqrt{3}$
$z = 3$

Step 3. Calculate $\rho$ using the formula.

Step 4. Calculate $\theta$:
- Since $x = 0$, we need to handle this special case.
- When $x = 0$ and $y < 0$, $\theta = \frac{3\pi}{2}$

Step 5. Calculate $\phi$ using the formula.

Step 6. Ensure $\theta$ is in the range $[0, 2\pi)$ and $\phi$ is in the range $[0, \pi]$.
</plan>

<code>
from sympy import sqrt, atan2, acos, pi

def rectangular_to_spherical():
    x, y, z = 0, -3*sqrt(3), 3
    rho = sqrt(x**2 + y**2 + z**2)
    theta = atan2(y, x)
    if theta < 0:
        theta += 2*pi  # shift theta into the range [0, 2*pi)
    phi = acos(z/rho)
    return rho, theta, phi

spherical_coordinates = rectangular_to_spherical()
print(spherical_coordinates)
</code>


<final_answer>
(6, 3*pi/2, pi/3)
</final_answer>

### Example Task 2

<task>
Determine who lived longer between Lowell Sherman and Jonathan Kaplan.
</task>

<plan>
Step 1: Research the birth and death dates of Lowell Sherman.
Step 2: Research the birth and death dates of Jonathan Kaplan.
Step 3: Calculate the lifespan of each person in years.
Step 4: Compare the lifespans to determine who lived longer.
</plan>

<code>
from datetime import datetime

def calculate_lifespan(birth_date, death_date):
    birth = datetime.strptime(birth_date, "%Y-%m-%d")
    death = datetime.strptime(death_date, "%Y-%m-%d")
    return (death - birth).days / 365.25

def compare_lifespans():
    lowell_sherman = calculate_lifespan("1885-10-11", "1934-12-28")
    jonathan_kaplan = calculate_lifespan("1947-11-25", "2021-01-03")

    if lowell_sherman > jonathan_kaplan:
        return "Lowell Sherman"
    elif jonathan_kaplan > lowell_sherman:
        return "Jonathan Kaplan"
    else:
        return "They lived equally long"

result = compare_lifespans()
print(f"{result} lived longer.")
</code>

<final_answer>
Jonathan Kaplan lived longer.
</final_answer>


---

## Important Notes

Note that the above are example tasks and output formats. You need to solve the current problem below.

---

## Current problem that we want to solve
<task>
<prompt>
</task>

## Previous Solution

### Previous Reasoning Steps
<plan>
<current_reasoning>
</plan>

### Previous Answer
<final_answer>
<current_solution>
</final_answer>



---

## Critical Feedback
<critical_feedback>

### Suggested Plan
<revised_plan>

---

## Your Final Solution

Read the current problem in <task>...</task> again.

<task>
<prompt>
</task>

To solve the current problem, first write your overall plan in <plan>...</plan>, then write Python code in <code>...</code> to solve the problem. If there is critical feedback and a suggested plan, revise your previous solution (if any) and provide a new plan and solution based on them.

## Remember to present your output in the following format:

<plan>
[Your general plan to solve the problem by using code. You can recall the required knowledge that you can use in the code, such as facts, formulas, etc.]
</plan>

<code>
[Your Python code to solve the current problem (instead of the example problems). Please print the final answer at the end of the code.]
</code>

You must follow the format strictly; do not miss any field.
Start your output with "<plan>...</plan>" and end your output with "<code> ... </code>".
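The Swift agent's `<plan>`/`<code>` reply is handled the same way in `solve`: parse the tags, then hand the code to `PythonExecutor` from `code_executor.py` and capture stdout as the answer. A minimal sketch with a made-up reply (note that `PythonExecutor` runs code in a worker pool, so this needs to run under `if __name__ == '__main__':` with the spawn start method, as in `test.py`):

```python
import multiprocess
from code_executor import PythonExecutor
from utils import extract_and_parse_markup

if __name__ == '__main__':
    multiprocess.set_start_method('spawn')

    # Hypothetical swift reply, following the format above.
    reply = "<plan>\nStep 1. Solve 2x + 5 = 13 => x = 4.\n</plan>\n<code>\nprint((13 - 5) / 2)\n</code>"
    parsed = extract_and_parse_markup(reply)

    executor = PythonExecutor(get_answer_from_stdout=True)
    code_result, code_report = executor.apply(parsed["code"])
    print(code_result, code_report)
```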
run_eval.sh
ADDED
@@ -0,0 +1,6 @@
DEBUG_MODE="-m debugpy --listen 127.0.0.1:5679 --wait-for-client"

python $DEBUG_MODE main.py \
    --eval_mode benchmark \
    --dataset_name MATH \
    --num_test_sample 4
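Note that with `DEBUG_MODE` set as above, `debugpy` will block until a debugger client attaches on port 5679; for a normal (non-debug) run, leave `DEBUG_MODE` empty.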
test.py
ADDED
@@ -0,0 +1,28 @@
from code_executor import PythonExecutor
import multiprocess

if __name__ == '__main__':
    multiprocess.set_start_method('spawn')

    current_code = """
```python
def calculate_hydrogen_mass(mass_of_water_grams):
    mass_of_hydrogen = 1.00794  # g/mol
    mass_of_water = 18.01528  # g/mol
    ratio = (2 * mass_of_hydrogen) / mass_of_water
    return ratio * mass_of_water_grams

mass_of_water = 23.5  # grams
hydrogen_mass = calculate_hydrogen_mass(mass_of_water)

print(hydrogen_mass)
```
"""
    executor = PythonExecutor(get_answer_from_stdout=True)
    result, report = executor.apply(current_code)
    print("Result:", result)
    print("Report:", report)

    # Make sure to close the pool when done
    executor.pool.close()
    executor.pool.join()
utils.py
ADDED
@@ -0,0 +1,260 @@
import datetime
import json
import logging
import os
import re
from abc import ABC, abstractmethod

import dirtyjson
import hjson
import numpy as np
import openai
from fuzzywuzzy import process
from sklearn.metrics.pairwise import cosine_similarity

api_configs = {
    "SambaNova": {
        "api_key": os.environ.get("SAMBANOVA_API_KEY"),
        "url_base": "https://api.sambanova.ai/v1"
    },
    "Together": {
        "api_key": os.environ.get("TOGETHER_API_KEY"),
        "url_base": "https://api.together.xyz/v1"
    }
    # You can add more API configurations here for other providers
}

class Agent(ABC):
    def __init__(self, prompt_template, llm_client):
        self.prompt_template = prompt_template
        self.llm_client = llm_client

    @abstractmethod
    def generate_response(self, prompt):
        pass


def setup_logging():
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs("logs", exist_ok=True)  # make sure the log directory exists
    log_filename = f"logs/swiftsage_log_{timestamp}.txt"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        filename=log_filename,
        filemode='w'
    )

    # Also print to console
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    return logging.getLogger('SwiftSage')


def extract_and_parse_markup(text):
    keys = ["reasoning_steps", "final_answer", "feedback", "score", "critical_feedback", "revised_plan", "solved", "plan", "code"]
    result = {}
    # close an unterminated <final_answer> tag (e.g., when generation was cut off)
    if "<final_answer>" in text and "</final_answer>" not in text:
        text = text + "</final_answer>"

    for key in keys:
        # Create a pattern for each key
        pattern = f'<{key}>(.*?)</{key}>'

        # Search for the pattern in the text
        match = re.search(pattern, text, re.DOTALL)

        if match:
            # Extract the content, strip whitespace, and add to the result
            content = match.group(1).strip()
            result[key] = content

    if "code" in result.keys():
        result["code"] = result["code"].replace("```python", "").replace("```", "").strip()

    return result


class PromptTemplate:
    def __init__(self, template_dir):
        self.template_dir = template_dir
        self.templates = {}
        self.load_templates()

    def load_templates(self):
        for filename in ['swift_template.md', 'sage_template.md', 'reward_template.md']:
            with open(os.path.join(self.template_dir, filename), 'r') as f:
                key = filename.split('_')[0]
                self.templates[key] = f.read()

    def format(self, key, **kwargs):
        template = self.templates.get(key, "")
        for k, v in kwargs.items():
            template = template.replace("<" + k + ">", str(v))
        return template


class LLMClient:
    def __init__(self, model_id, api_config, temperature=0.3, top_p=1.0, max_tokens=3000, logger=None):
        self.client = openai.OpenAI(
            api_key=api_config['api_key'],
            base_url=api_config['url_base']
        )
        self.model_id = model_id
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.logger = logger

    def generate_response(self, messages):
        self.logger.info(f"Sending request to {self.model_id}")
        self.logger.info(f"Messages: {messages}")
        response = self.client.chat.completions.create(
            model=self.model_id,
            messages=messages,
            temperature=self.temperature,
            top_p=self.top_p,
            max_tokens=self.max_tokens
        )
        content = response.choices[0].message.content
        self.logger.info(f"Response from {self.model_id}:\n{content}")
        return content


if __name__ == "__main__":
    test_text = "test"

    print(extract_and_parse_markup(test_text))


"""

def extract_and_parse_json(text):

    keys_and_types = [
        ("reasoning_steps", list),
        ("final_answer", str),
        ("feedback", str),
        ("score", str),
        ("score", int),
        ("feedback", str),
        ("solved", str),
        ("critical_feedback", str),
        ("revised_plan", list),
    ]

    # Try to parse the JSON first
    try:
        # find the first and last curly braces and parse the json
        first_brace = text.find("{")
        last_brace = text.rfind("}")
        if last_brace == -1:
            text = text + "\"}"
        if first_brace != -1 and last_brace != -1 and first_brace < last_brace:
            data = json.loads(text[first_brace:last_brace+1])
            return data
    except Exception as e:
        data = {}
        try:
            data = dirtyjson.loads(text)
        except Exception as e:
            pass
        # If JSON parsing fails, use regex to extract key-value pairs

        for key, _ in keys_and_types:
            # pattern = rf'"{key}"\s*:\s*([\[{{].*?[\]}}]|".*?")'
            pattern = rf'"{key}"\s*:\s*([\[{{].*?[\]}}]|".*?"|[-+]?\d+)'
            match = re.search(pattern, text, re.DOTALL)
            if match:
                try:
                    value = json.loads(match.group(1))
                except Exception as e:
                    value = match.group(1).strip('"')
                data[key] = value

    result = {}
    for key, expected_type in keys_and_types:
        if key in result.keys() and result[key] is not None:
            continue
        # Use fuzzy matching to find the closest key
        try:
            closest_key, score = process.extractOne(key, data.keys())
        except Exception as e:
            continue
        if score > 80:  # You can adjust this threshold
            value = data[closest_key]

            # Type checking and conversion
            if expected_type == list and isinstance(value, str):
                value = [item.strip() for item in value.strip('[]').split(',')]
            elif expected_type == str and isinstance(value, list):
                value = ', '.join(value)
            elif expected_type == int and value is not None:
                try:
                    value = int(value)
                except ValueError:
                    value = None

            result[key] = value
        else:
            result[key] = None

    for key in list(result.keys()):
        if result[key] is None:
            del result[key]
    return result

def extract_and_parse_json_v1(text):
    def find_json_objects(s):
        # Find all substrings that look like JSON objects
        json_like_strs = re.findall(r'\{(?:[^{}]|\{[^{}]*\})*\}', s)
        return json_like_strs

    def try_parse_json(s):
        try:
            return json.loads(s)
        except json.JSONDecodeError:
            try:
                s = s.replace("\n", "")
                return hjson.loads(s)
            except json.JSONDecodeError:
                return None
        return None

    # First, try to find JSON within code blocks
    code_block_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
    code_blocks = re.findall(code_block_pattern, text, re.IGNORECASE)

    all_json_candidates = []

    # Add JSON candidates from code blocks
    for block in code_blocks:
        all_json_candidates.extend(find_json_objects(block))

    # Add JSON candidates from the entire text
    all_json_candidates.extend(find_json_objects(text))

    # Sort candidates by length, descending
    all_json_candidates.sort(key=len, reverse=True)

    # Try to parse each candidate
    for candidate in all_json_candidates:
        parsed_json = try_parse_json(candidate)
        if parsed_json is not None:
            return parsed_json

    raise ValueError("No valid JSON object found in the text")

"""
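Putting the pieces in this file together, a minimal sketch of an agent-side call (assuming `TOGETHER_API_KEY` is set in the environment; the model id is just an example):

```python
from utils import LLMClient, PromptTemplate, api_configs, setup_logging, extract_and_parse_markup

logger = setup_logging()
templates = PromptTemplate('./prompt_templates')

client = LLMClient(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    api_config=api_configs['Together'],
    logger=logger,
)

# Fill the swift template's placeholders, send it as a chat message, and parse the reply.
prompt = templates.format('swift', prompt="Solve the equation: 2x + 5 = 13",
                          current_reasoning="", current_solution="",
                          revised_plan="", critical_feedback="")
response = client.generate_response([{"role": "user", "content": prompt}])
print(extract_and_parse_markup(response))
```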