bigcodebench-evaluator-2

Running

App Files Files Community

Terry Zhuo commited on Aug 16

Commit

de45929

•

1 Parent(s): 5489a0a

revert back to eval only

Browse files

Files changed (3) hide show

_app.py +648 -0
app.py +152 -622
demo.py +0 -178

_app.py ADDED Viewed

	@@ -0,0 +1,648 @@

+import os
+import logging
+import time
+import datetime
+import gradio as gr
+from threading import Thread
+import datasets
+from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from apscheduler.schedulers.background import BackgroundScheduler
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+from src.display.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    # INTRODUCTION_TEXT,
+    TITLE,
+    ABOUT_TEXT,
+    SUBMISSION_TEXT_3,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    fields,
+    EvalQueueColumn
+)
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    RESULT_REPO,
+    DATA_VERSION,
+    DATA_REPO,
+    HARD_RESULT_REPO,
+    ELO_REPO,
+    HARD_ELO_REPO,
+    SOLVE_REPO,
+    HARD_SOLVE_REPO,
+    HF_TOKEN,
+    QUEUE_REPO,
+    REPO_ID,
+    VOTES_REPO,
+    VOTES_PATH,
+    HF_HOME,
+)
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.execute import generate_command, is_running, lock, stream_logs, find_result_file
+from src.tools.plots import plot_elo_mle, plot_solve_rate
+# from src.voting.vote_system import VoteManager, run_scheduler
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
+NEW_DATA_ON_LEADERBOARD = True
+LEADERBOARD_DF = None
+HARD_LEADERBOARD_DF = None
+ELO_TASK_DF = None
+ELO_BENCH_DF = None
+HARD_ELO_TASK_DF = None
+HARD_ELO_BENCH_DF = None
+COMPLETE_SOLVE_DF = None
+INSTRUCT_SOLVE_DF = None
+HARD_COMPLETE_SOLVE_DF = None
+HARD_INSTRUCT_SOLVE_DF = None
+DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
+                             verification_mode="no_checks")
+def filter_data(data, keyword):
+    if not keyword:
+        return data
+    filtered_data = [item for item in data if keyword.lower() in item['complete_prompt'].lower()]
+    return filtered_data
+def update_display(search_keyword, index, show_test):
+    filtered_data = filter_data(DATA, search_keyword)
+    if not filtered_data:
+        return ["No data available. Check the search criteria."] + [""] * 4 + [0, gr.update(maximum=0, value=0)]
+    max_index = len(filtered_data) - 1
+    index = min(max(0, index), max_index)
+    task_id = filtered_data[index]['task_id']
+    snippet1 = filtered_data[index]['complete_prompt']
+    snippet2 = filtered_data[index]['instruct_prompt']
+    # snippet3 = filtered_data[index]['canonical_solution'] if show_solution else ""
+    snippet4 = filtered_data[index]['test'] if show_test else ""
+    return [
+        task_id,
+        snippet1,
+        snippet2,
+        # snippet3,
+        snippet4,
+        len(filtered_data),
+        gr.update(maximum=max_index, value=index)
+    ]
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+    return wrapper
+@time_diff_wrapper
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
+    """Download dataset with exponential backoff retries."""
+    attempt = 0
+    while attempt < max_attempts:
+        try:
+            logging.info(f"Downloading {repo_id} to {local_dir}")
+            snapshot_download(
+                repo_id=repo_id,
+                local_dir=local_dir,
+                repo_type=repo_type,
+                tqdm_class=None,
+                etag_timeout=30,
+                max_workers=8,
+            )
+            logging.info("Download successful")
+            return
+        except Exception as e:
+            wait_time = backoff_factor**attempt
+            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
+            time.sleep(wait_time)
+            attempt += 1
+    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
+def get_latest_data_leaderboard(
+    leaderboard_initial_df = None,
+    hard_leaderboard_initial_df = None,
+    elo_task_df = None,
+    elo_bench_df = None,
+    hard_elo_task_df = None,
+    hard_elo_bench_df = None,
+    complete_solve_df = None,
+    instruct_solve_df = None,
+    hard_complete_solve_df = None,
+    hard_instruct_solve_df = None
+    ):
+    global NEW_DATA_ON_LEADERBOARD
+    global LEADERBOARD_DF
+    global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
+    global HARD_ELO_TASK_DF
+    global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
+    global HARD_COMPLETE_SOLVE_DF
+    global HARD_INSTRUCT_SOLVE_DF
+    if NEW_DATA_ON_LEADERBOARD:
+        print("Leaderboard updated at reload!")
+        leaderboard_dataset = datasets.load_dataset(
+            RESULT_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        )
+        LEADERBOARD_DF = get_leaderboard_df(
+            leaderboard_dataset=leaderboard_dataset,
+            cols=COLS,
+        )
+        hard_leaderboard_dataset = datasets.load_dataset(
+            HARD_RESULT_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        )
+        hard_leaderboard_df = get_leaderboard_df(
+            leaderboard_dataset=hard_leaderboard_dataset,
+            cols=COLS,
+        )
+        HARD_LEADERBOARD_DF = hard_leaderboard_df
+        elo_task_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="task_no_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        elo_bench_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="benchmark_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        ELO_TASK_DF = elo_task_df
+        ELO_BENCH_DF = elo_bench_df
+        hard_elo_task_df = datasets.load_dataset(
+            HARD_ELO_REPO,
+            "default",
+            split="task_no_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        hard_elo_bench_df = datasets.load_dataset(
+            HARD_ELO_REPO,
+            "default",
+            split="benchmark_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        HARD_ELO_TASK_DF = hard_elo_task_df
+        HARD_ELO_BENCH_DF = hard_elo_bench_df
+        complete_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="complete",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        instruct_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="instruct",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        COMPLETE_SOLVE_DF = complete_solve_df
+        INSTRUCT_SOLVE_DF = instruct_solve_df
+        hard_complete_solve_df = datasets.load_dataset(
+            HARD_SOLVE_REPO,
+            "default",
+            split="complete",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        hard_instruct_solve_df = datasets.load_dataset(
+            HARD_SOLVE_REPO,
+            "default",
+            split="instruct",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
+        NEW_DATA_ON_LEADERBOARD = False
+    else:
+        LEADERBOARD_DF = leaderboard_initial_df
+        # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        ELO_TASK_DF = elo_task_df
+        # ELO_BENCH_DF = elo_bench_df
+        # HARD_ELO_TASK_DF = hard_elo_task_df
+        HARD_ELO_BENCH_DF = hard_elo_bench_df
+        COMPLETE_SOLVE_DF = complete_solve_df
+        # INSTRUCT_SOLVE_DF = instruct_solve_df
+        # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+def init_space():
+    """Initializes the application space, loading only necessary data."""
+    # Always redownload the leaderboard DataFrame
+    global LEADERBOARD_DF
+    global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
+    global HARD_ELO_TASK_DF
+    global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
+    global HARD_COMPLETE_SOLVE_DF
+    global HARD_INSTRUCT_SOLVE_DF
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+# Initialize VoteManager
+# vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
+# Schedule the upload_votes method to run every 15 minutes
+# schedule.every(15).minutes.do(vote_manager.upload_votes)
+# Start the scheduler in a separate thread
+# scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
+# scheduler_thread.start()
+# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
+# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
+LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
+ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
+COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
+HARD_INSTRUCT_SOLVE_DF = init_space()
+# HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
+# Data processing for plots now only on demand in the respective Gradio tab
+# def load_and_create_plots():
+#     plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
+#     return plot_df
+# Function to check if a user is logged in
+def check_login(profile: gr.OAuthProfile | None) -> bool:
+    if profile is None:
+        return False
+    return True
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
+            ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
+            ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
+            ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+        )
+def init_others(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Gradio DataFrame is empty or None.")
+    return gr.Dataframe(dataframe, visible=False)
+main_block = gr.Blocks(css=custom_css)
+with main_block as demo:
+    with gr.Row(elem_id="header-row"):
+        gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.Tab("💎 Hard Set") as hard_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
+                hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                **Notes:**
+                - For the efficiency reasons, we only display the Hard Set leaderboard.
+                - _Hard Set_ vs _Full Set_:
+                    - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
+                    - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
+                - _Complete_ vs _Instruct_:
+                    - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
+                    - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
+                - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
+                - `Average` is the average of `Complete` and `Instruct` when both are available.
+                - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
+                - `#Act Params (B)` is the number of activated model parameters during inference.
+                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                - For more details check the 📝 About section.
+                """,
+                    elem_classes="markdown-text",
+                )
+            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+                with gr.Column():
+                    with gr.Group():
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        hard_task_elo_map = gr.Plot()
+                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [hard_elo_task_gr],
+                                    hard_task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        hard_bench_elo_map = gr.Plot()
+                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                                    hard_bench_elo_map)
+            with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
+                with gr.Column():
+                    hard_complete_map = gr.Plot()
+                    hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_complete_map)
+                    hard_instruct_map = gr.Plot()
+                    hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_instruct_map)
+        with gr.Tab("🎯 Full Set") as full_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
+                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                **Notes:**
+                - _Complete_ vs _Instruct_:
+                    - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
+                    - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
+                - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
+                - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+                - `size` is the amount of activated model weight during inference.
+                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                - For more details check the 📝 About section.
+                """,
+                    elem_classes="markdown-text",
+                )
+            with gr.TabItem("📊 Elo Rating", id="full_elo"):
+                with gr.Column():
+                    with gr.Group():
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        task_elo_map = gr.Plot()
+                        elo_task_gr = init_others(ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        bench_elo_map = gr.Plot()
+                        elo_bench_gr = init_others(ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
+            with gr.TabItem("🧩 Solve Rate", id="full_solve"):
+                with gr.Column():
+                    complete_map = gr.Plot()
+                    complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                ], complete_map)
+                    instruct_map = gr.Plot()
+                    instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                ], instruct_map)
+        with gr.TabItem("📝 About", id=3):
+            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("🔎 Data Viewer", id="viewer"):
+            search_input = gr.Textbox(label="Search by keyword")
+            count_output = gr.Number(label="Number of filtered items")
+            index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
+            # show_solution = gr.Checkbox(label="Show Solution")
+            show_test = gr.Checkbox(label="Show Test Cases")
+            update_button = gr.Button("Update")
+            task_id_output = gr.Textbox(label="Task ID")
+            code_completion = gr.Code(language="python", label="Code Completion")
+            nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")
+            # solution = gr.Code(language="python", label="Solution")
+            test_cases = gr.Code(language="python", label="Test Cases")
+            update_button.click(
+                update_display,
+                inputs=[search_input, index_slider, show_test],
+                outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
+            )
+            # Initial load
+            demo.load(
+                update_display,
+                inputs=[search_input, index_slider, show_test],
+                outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
+            )
+        with gr.TabItem("🚀 Request", id=4):
+            gr.Markdown(SUBMISSION_TEXT_3)
+        with gr.TabItem("🛠️ Execute", id=5):
+            gr.Markdown("# BigCodeBench Evaluator")
+            with gr.Row():
+                jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
+                split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
+                subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
+            with gr.Row():
+                parallel = gr.Number(label="Parallel (optional)", precision=0)
+                min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
+                max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
+            with gr.Row():
+                max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
+                max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
+                check_gt_only = gr.Checkbox(label="Check GT Only")
+                no_gt = gr.Checkbox(label="No GT")
+            command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
+            with gr.Row():
+                submit_btn = gr.Button("Run Evaluation")
+                download_btn = gr.DownloadButton(label="Download Result")
+            log_output = gr.Textbox(label="Execution Logs", lines=20)
+            input_components = [
+                jsonl_file, split, subset, parallel,
+                min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
+                check_gt_only, no_gt
+            ]
+            for component in input_components:
+                component.change(generate_command, inputs=input_components, outputs=command_output)
+            def start_evaluation(command, jsonl_file, subset, split):
+                extra = subset + "_" if subset != "full" else ""
+                if jsonl_file is not None:
+                    result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
+                else:
+                    result_path = None
+                for log in stream_logs(command, jsonl_file):
+                    if jsonl_file is not None:
+                        yield log, gr.update(value=result_path, label=result_path), gr.update()
+                    else:
+                        yield log, gr.update(), gr.update()
+                is_running = False
+                result_file = find_result_file()
+                if result_file:
+                    return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
+                            # gr.Button(visible=False)#,
+                            # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
+                else:
+                    return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
+                            # gr.Button("Run Evaluation", visible=True),
+                            # gr.DownloadButton(visible=False))
+            submit_btn.click(start_evaluation,
+                        inputs=[command_output, jsonl_file, subset, split],
+                        outputs=[log_output, download_btn])
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+    # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
+main_block.queue(default_concurrency_limit=100)
+def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
+    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
+    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
+    # ht to Lucain!
+    if SPACE_ID is None:
+        print("Not in a Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+    if IS_EPHEMERAL_SPACE:
+        print("In an ephemeral Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
+    config = card.data.get("space_ci", {})
+    print(f"Enabling Space CI with config from README: {config}")
+    return configure_space_ci(
+        blocks=ui,
+        trusted_authors=config.get("trusted_authors"),
+        private=config.get("private", "auto"),
+        variables=config.get("variables", "auto"),
+        secrets=config.get("secrets"),
+        hardware=config.get("hardware"),
+        storage=config.get("storage"),
+    )
+# Create webhooks server (with CI url if in Space and not ephemeral)
+webhooks_server = enable_space_ci_and_return_server(ui=main_block)
+# Add webhooks
+@webhooks_server.add_webhook
+def update_leaderboard(payload: WebhookPayload) -> None:
+    """Redownloads the leaderboard dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        global NEW_DATA_ON_LEADERBOARD
+        if NEW_DATA_ON_LEADERBOARD:
+            return
+        NEW_DATA_ON_LEADERBOARD = True
+        for repo in [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]:
+            datasets.load_dataset(
+                repo,
+                "default",
+                cache_dir=HF_HOME,
+                download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+                verification_mode="no_checks"
+            )
+webhooks_server.launch()
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
+scheduler.start()

app.py CHANGED Viewed

@@ -1,648 +1,178 @@
 import os
-import logging
 import time
-import datetime
-import gradio as gr
-from threading import Thread
-import datasets
-from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from apscheduler.schedulers.background import BackgroundScheduler
-# Start ephemeral Spaces on PRs (see config in README.md)
-from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
-from src.display.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    # INTRODUCTION_TEXT,
-    TITLE,
-    ABOUT_TEXT,
-    SUBMISSION_TEXT_3,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    fields,
-    EvalQueueColumn
-)
-from src.envs import (
-    API,
-    EVAL_REQUESTS_PATH,
-    RESULT_REPO,
-    DATA_VERSION,
-    DATA_REPO,
-    HARD_RESULT_REPO,
-    ELO_REPO,
-    HARD_ELO_REPO,
-    SOLVE_REPO,
-    HARD_SOLVE_REPO,
-    HF_TOKEN,
-    QUEUE_REPO,
-    REPO_ID,
-    VOTES_REPO,
-    VOTES_PATH,
-    HF_HOME,
-)
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.execute import generate_command, default_command, is_running, lock, stream_logs, find_result_file
-from src.tools.plots import plot_elo_mle, plot_solve_rate
-# from src.voting.vote_system import VoteManager, run_scheduler
-# Configure logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-# Start ephemeral Spaces on PRs (see config in README.md)
-from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
-# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
-# This controls whether a full initialization should be performed.
-DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
-NEW_DATA_ON_LEADERBOARD = True
-LEADERBOARD_DF = None
-HARD_LEADERBOARD_DF = None
-ELO_TASK_DF = None
-ELO_BENCH_DF = None
-HARD_ELO_TASK_DF = None
-HARD_ELO_BENCH_DF = None
-COMPLETE_SOLVE_DF = None
-INSTRUCT_SOLVE_DF = None
-HARD_COMPLETE_SOLVE_DF = None
-HARD_INSTRUCT_SOLVE_DF = None
-DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
-                             verification_mode="no_checks")
-def filter_data(data, keyword):
-    if not keyword:
-        return data
-    filtered_data = [item for item in data if keyword.lower() in item['complete_prompt'].lower()]
-    return filtered_data
-def update_display(search_keyword, index, show_test):
-    filtered_data = filter_data(DATA, search_keyword)
-    if not filtered_data:
-        return ["No data available. Check the search criteria."] + [""] * 4 + [0, gr.update(maximum=0, value=0)]
-    max_index = len(filtered_data) - 1
-    index = min(max(0, index), max_index)
-    task_id = filtered_data[index]['task_id']
-    snippet1 = filtered_data[index]['complete_prompt']
-    snippet2 = filtered_data[index]['instruct_prompt']
-    # snippet3 = filtered_data[index]['canonical_solution'] if show_solution else ""
-    snippet4 = filtered_data[index]['test'] if show_test else ""
-    return [
-        task_id,
-        snippet1,
-        snippet2,
-        # snippet3,
-        snippet4,
-        len(filtered_data),
-        gr.update(maximum=max_index, value=index)
-    ]
-def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
-def time_diff_wrapper(func):
-    def wrapper(*args, **kwargs):
-        start_time = time.time()
-        result = func(*args, **kwargs)
-        end_time = time.time()
-        diff = end_time - start_time
-        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
-        return result
-    return wrapper
-@time_diff_wrapper
-def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
-    """Download dataset with exponential backoff retries."""
-    attempt = 0
-    while attempt < max_attempts:
         try:
-            logging.info(f"Downloading {repo_id} to {local_dir}")
-            snapshot_download(
-                repo_id=repo_id,
-                local_dir=local_dir,
-                repo_type=repo_type,
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-            logging.info("Download successful")
-            return
         except Exception as e:
-            wait_time = backoff_factor**attempt
-            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
-            time.sleep(wait_time)
-            attempt += 1
-    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
-def get_latest_data_leaderboard(
-    leaderboard_initial_df = None,
-    hard_leaderboard_initial_df = None,
-    elo_task_df = None,
-    elo_bench_df = None,
-    hard_elo_task_df = None,
-    hard_elo_bench_df = None,
-    complete_solve_df = None,
-    instruct_solve_df = None,
-    hard_complete_solve_df = None,
-    hard_instruct_solve_df = None
-    ):
-    global NEW_DATA_ON_LEADERBOARD
-    global LEADERBOARD_DF
-    global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
-    global COMPLETE_SOLVE_DF
-    global INSTRUCT_SOLVE_DF
-    global HARD_COMPLETE_SOLVE_DF
-    global HARD_INSTRUCT_SOLVE_DF
-    if NEW_DATA_ON_LEADERBOARD:
-        print("Leaderboard updated at reload!")
-        leaderboard_dataset = datasets.load_dataset(
-            RESULT_REPO,
-            "default",
-            split="train",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        )
-        LEADERBOARD_DF = get_leaderboard_df(
-            leaderboard_dataset=leaderboard_dataset,
-            cols=COLS,
-        )
-        hard_leaderboard_dataset = datasets.load_dataset(
-            HARD_RESULT_REPO,
-            "default",
-            split="train",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        )
-        hard_leaderboard_df = get_leaderboard_df(
-            leaderboard_dataset=hard_leaderboard_dataset,
-            cols=COLS,
-        )
-        HARD_LEADERBOARD_DF = hard_leaderboard_df
-        elo_task_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        elo_bench_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        ELO_TASK_DF = elo_task_df
-        ELO_BENCH_DF = elo_bench_df
-        hard_elo_task_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        hard_elo_bench_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
-        complete_solve_df = datasets.load_dataset(
-            SOLVE_REPO,
-            "default",
-            split="complete",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        instruct_solve_df = datasets.load_dataset(
-            SOLVE_REPO,
-            "default",
-            split="instruct",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        COMPLETE_SOLVE_DF = complete_solve_df
-        INSTRUCT_SOLVE_DF = instruct_solve_df
-        hard_complete_solve_df = datasets.load_dataset(
-            HARD_SOLVE_REPO,
-            "default",
-            split="complete",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        hard_instruct_solve_df = datasets.load_dataset(
-            HARD_SOLVE_REPO,
-            "default",
-            split="instruct",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
-        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
-        NEW_DATA_ON_LEADERBOARD = False
-    else:
-        LEADERBOARD_DF = leaderboard_initial_df
-        # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
-        ELO_TASK_DF = elo_task_df
-        # ELO_BENCH_DF = elo_bench_df
-        # HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
-        COMPLETE_SOLVE_DF = complete_solve_df
-        # INSTRUCT_SOLVE_DF = instruct_solve_df
-        # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
-        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
-    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
-def init_space():
-    """Initializes the application space, loading only necessary data."""
-    # Always redownload the leaderboard DataFrame
-    global LEADERBOARD_DF
-    global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
-    global COMPLETE_SOLVE_DF
-    global INSTRUCT_SOLVE_DF
-    global HARD_COMPLETE_SOLVE_DF
-    global HARD_INSTRUCT_SOLVE_DF
-    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
-    # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
-    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
-# Initialize VoteManager
-# vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
-# Schedule the upload_votes method to run every 15 minutes
-# schedule.every(15).minutes.do(vote_manager.upload_votes)
-# Start the scheduler in a separate thread
-# scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
-# scheduler_thread.start()
-# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
-# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
-ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
-COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
-HARD_INSTRUCT_SOLVE_DF = init_space()
-# HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
-# Data processing for plots now only on demand in the respective Gradio tab
-# def load_and_create_plots():
-#     plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
-#     return plot_df
-# Function to check if a user is logged in
-def check_login(profile: gr.OAuthProfile | None) -> bool:
-    if profile is None:
-        return False
-    return True
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
-            ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
-            ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
-            ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-        )
-def init_others(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Gradio DataFrame is empty or None.")
-    return gr.Dataframe(dataframe, visible=False)
-main_block = gr.Blocks(css=custom_css)
-with main_block as demo:
-    with gr.Row(elem_id="header-row"):
-        gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
-    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.Tab("💎 Hard Set") as hard_tabs:
-            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
-                hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
-                gr.Markdown(
-                    """
-                **Notes:**
-                - For the efficiency reasons, we only display the Hard Set leaderboard.
-                - _Hard Set_ vs _Full Set_:
-                    - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
-                    - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
-                - _Complete_ vs _Instruct_:
-                    - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
-                    - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
-                - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
-                - `Average` is the average of `Complete` and `Instruct` when both are available.
-                - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
-                - `#Act Params (B)` is the number of activated model parameters during inference.
-                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
-                - For more details check the 📝 About section.
-                """,
-                    elem_classes="markdown-text",
-                )
-            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
-                with gr.Column():
-                    with gr.Group():
-                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                        hard_task_elo_map = gr.Plot()
-                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
-                        demo.load(plot_elo_mle, [hard_elo_task_gr],
-                                    hard_task_elo_map)
-                    with gr.Group():
-                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                        hard_bench_elo_map = gr.Plot()
-                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
-                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
-                                    hard_bench_elo_map)
-            with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
-                with gr.Column():
-                    hard_complete_map = gr.Plot()
-                    hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
-                    demo.load(plot_solve_rate, [hard_complete_solve_gr,
-                                                gr.Textbox("Complete", visible=False),
-                                                gr.Number(10, visible=False),
-                                                gr.Number(16, visible=False),
-                                                ], hard_complete_map)
-                    hard_instruct_map = gr.Plot()
-                    hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
-                    demo.load(plot_solve_rate, [hard_instruct_solve_gr,
-                                                gr.Textbox("Instruct", visible=False),
-                                                gr.Number(10, visible=False),
-                                                gr.Number(16, visible=False),
-                                                ], hard_instruct_map)
-        with gr.Tab("🎯 Full Set") as full_tabs:
-            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
-                leaderboard = init_leaderboard(LEADERBOARD_DF)
-                gr.Markdown(
-                    """
-                **Notes:**
-                - _Complete_ vs _Instruct_:
-                    - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
-                    - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
-                - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
-                - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
-                - `size` is the amount of activated model weight during inference.
-                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
-                - For more details check the 📝 About section.
-                """,
-                    elem_classes="markdown-text",
-                )
-            with gr.TabItem("📊 Elo Rating", id="full_elo"):
-                with gr.Column():
-                    with gr.Group():
-                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                        task_elo_map = gr.Plot()
-                        elo_task_gr = init_others(ELO_TASK_DF)
-                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
-                    with gr.Group():
-                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                        bench_elo_map = gr.Plot()
-                        elo_bench_gr = init_others(ELO_BENCH_DF)
-                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
-            with gr.TabItem("🧩 Solve Rate", id="full_solve"):
-                with gr.Column():
-                    complete_map = gr.Plot()
-                    complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
-                    demo.load(plot_solve_rate, [complete_solve_gr,
-                                                gr.Textbox("Complete", visible=False),
-                                                ], complete_map)
-                    instruct_map = gr.Plot()
-                    instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
-                    demo.load(plot_solve_rate, [instruct_solve_gr,
-                                                gr.Textbox("Instruct", visible=False),
-                                                ], instruct_map)
-        with gr.TabItem("📝 About", id=3):
-            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("🔎 Data Viewer", id="viewer"):
-            search_input = gr.Textbox(label="Search by keyword")
-            count_output = gr.Number(label="Number of filtered items")
-            index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
-            # show_solution = gr.Checkbox(label="Show Solution")
-            show_test = gr.Checkbox(label="Show Test Cases")
-            update_button = gr.Button("Update")
-            task_id_output = gr.Textbox(label="Task ID")
-            code_completion = gr.Code(language="python", label="Code Completion")
-            nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")
-            # solution = gr.Code(language="python", label="Solution")
-            test_cases = gr.Code(language="python", label="Test Cases")
-            update_button.click(
-                update_display,
-                inputs=[search_input, index_slider, show_test],
-                outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
-            )
-            # Initial load
-            demo.load(
-                update_display,
-                inputs=[search_input, index_slider, show_test],
-                outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
-            )
-        with gr.TabItem("🚀 Request", id=4):
-            gr.Markdown(SUBMISSION_TEXT_3)
-        with gr.TabItem("🛠️ Execute", id=5):
-            gr.Markdown("# BigCodeBench Evaluator")
-            with gr.Row():
-                jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
-                split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
-                subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
-            with gr.Row():
-                parallel = gr.Number(label="Parallel (optional)", precision=0)
-                min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
-                max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
-            with gr.Row():
-                max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
-                max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
-                check_gt_only = gr.Checkbox(label="Check GT Only")
-                no_gt = gr.Checkbox(label="No GT")
-            command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
-            with gr.Row():
-                submit_btn = gr.Button("Run Evaluation")
-                download_btn = gr.DownloadButton(label="Download Result")
-            log_output = gr.Textbox(label="Execution Logs", lines=20)
-            input_components = [
-                jsonl_file, split, subset, parallel,
-                min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
-                check_gt_only, no_gt
-            ]
-            for component in input_components:
-                component.change(generate_command, inputs=input_components, outputs=command_output)
-            def start_evaluation(command, jsonl_file, subset, split):
-                extra = subset + "_" if subset != "full" else ""
-                if jsonl_file is not None:
-                    result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
-                else:
-                    result_path = None
-                for log in stream_logs(command, jsonl_file):
-                    if jsonl_file is not None:
-                        yield log, gr.update(value=result_path, label=result_path), gr.update()
-                    else:
-                        yield log, gr.update(), gr.update()
-                is_running = False
-                result_file = find_result_file()
-                if result_file:
-                    return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
-                            # gr.Button(visible=False)#,
-                            # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
-                else:
-                    return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
-                            # gr.Button("Run Evaluation", visible=True),
-                            # gr.DownloadButton(visible=False))
-            submit_btn.click(start_evaluation,
-                        inputs=[command_output, jsonl_file, subset, split],
-                        outputs=[log_output, download_btn])
     with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
-    # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
-    # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
-    # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
-main_block.queue(default_concurrency_limit=100)
-def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
-    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
-    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
-    # ht to Lucain!
-    if SPACE_ID is None:
-        print("Not in a Space: Space CI disabled.")
-        return WebhooksServer(ui=main_block)
-    if IS_EPHEMERAL_SPACE:
-        print("In an ephemeral Space: Space CI disabled.")
-        return WebhooksServer(ui=main_block)
-    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
-    config = card.data.get("space_ci", {})
-    print(f"Enabling Space CI with config from README: {config}")
-    return configure_space_ci(
-        blocks=ui,
-        trusted_authors=config.get("trusted_authors"),
-        private=config.get("private", "auto"),
-        variables=config.get("variables", "auto"),
-        secrets=config.get("secrets"),
-        hardware=config.get("hardware"),
-        storage=config.get("storage"),
-    )
-# Create webhooks server (with CI url if in Space and not ephemeral)
-webhooks_server = enable_space_ci_and_return_server(ui=main_block)
-# Add webhooks
-@webhooks_server.add_webhook
-def update_leaderboard(payload: WebhookPayload) -> None:
-    """Redownloads the leaderboard dataset each time it updates"""
-    if payload.repo.type == "dataset" and payload.event.action == "update":
-        global NEW_DATA_ON_LEADERBOARD
-        if NEW_DATA_ON_LEADERBOARD:
-            return
-        NEW_DATA_ON_LEADERBOARD = True
-        for repo in [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]:
-            datasets.load_dataset(
-                repo,
-                "default",
-                cache_dir=HF_HOME,
-                download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
-                verification_mode="no_checks"
-            )
-webhooks_server.launch()
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
-scheduler.start()

+import gradio as gr
+import subprocess
+import sys
 import os
+import threading
 import time
+import uuid
+import glob
+import shutil
+from pathlib import Path
 from apscheduler.schedulers.background import BackgroundScheduler
+default_command = "bigcodebench.evaluate"
+is_running = False
+lock = threading.Lock()
+def generate_command(
+    jsonl_file, split, subset, parallel,
+    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
+    check_gt_only, no_gt
+):
+    command = [default_command]
+    if jsonl_file is not None:
+        # Copy the uploaded file to the current directory
+        local_filename = os.path.basename(jsonl_file.name)
+        shutil.copy(jsonl_file.name, local_filename)
+        command.extend(["--samples", local_filename])
+    command.extend(["--split", split, "--subset", subset])
+    if parallel is not None and parallel != 0:
+        command.extend(["--parallel", str(int(parallel))])
+    command.extend([
+        "--min-time-limit", str(min_time_limit),
+        "--max-as-limit", str(int(max_as_limit)),
+        "--max-data-limit", str(int(max_data_limit)),
+        "--max-stack-limit", str(int(max_stack_limit))
+    ])
+    if check_gt_only:
+        command.append("--check-gt-only")
+    if no_gt:
+        command.append("--no-gt")
+    return " ".join(command)
+def cleanup_previous_files(jsonl_file):
+    if jsonl_file is not None:
+        file_list = ['Dockerfile', 'app.py', 'README.md', os.path.basename(jsonl_file.name), "__pycache__"]
+    else:
+        file_list = ['Dockerfile', 'app.py', 'README.md', "__pycache__"]
+    for file in glob.glob("*"):
         try:
+            if file not in file_list:
+                os.remove(file)
         except Exception as e:
+            print(f"Error during cleanup of {file}: {e}")
+def find_result_file():
+    json_files = glob.glob("*.json")
+    if json_files:
+        return max(json_files, key=os.path.getmtime)
+    return None
+def run_bigcodebench(command):
+    global is_running
+    with lock:
+        if is_running:
+            yield "A command is already running. Please wait for it to finish.\n"
+            return
+        is_running = True
+    try:
+        yield f"Executing command: {command}\n"
+        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
+        for line in process.stdout:
+            yield line
+        # process.wait()
+        if process.returncode != 0:
+            yield f"Error: Command exited with status {process.returncode}\n"
+        yield "Evaluation completed.\n"
+        result_file = find_result_file()
+        if result_file:
+            yield f"Result file found: {result_file}\n"
+        else:
+            yield "No result file found.\n"
+    finally:
+        with lock:
+            is_running = False
+def stream_logs(command, jsonl_file=None):
+    global is_running
+    if is_running:
+        yield "A command is already running. Please wait for it to finish.\n"
+        return
+    cleanup_previous_files(jsonl_file)
+    yield "Cleaned up previous files.\n"
+    log_content = []
+    for log_line in run_bigcodebench(command):
+        log_content.append(log_line)
+        yield "".join(log_content)
+with gr.Blocks() as demo:
+    gr.Markdown("# BigCodeBench Evaluator")
+    with gr.Row():
+        jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
+        split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
+        subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
     with gr.Row():
+        parallel = gr.Number(label="Parallel (optional)", precision=0)
+        min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
+        max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
+    with gr.Row():
+        max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
+        max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
+        check_gt_only = gr.Checkbox(label="Check GT Only")
+        no_gt = gr.Checkbox(label="No GT")
+    command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
+    with gr.Row():
+        submit_btn = gr.Button("Run Evaluation")
+        download_btn = gr.DownloadButton(label="Download Result")
+    log_output = gr.Textbox(label="Execution Logs", lines=20)
+    input_components = [
+        jsonl_file, split, subset, parallel,
+        min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
+        check_gt_only, no_gt
+    ]
+    for component in input_components:
+        component.change(generate_command, inputs=input_components, outputs=command_output)
+    def start_evaluation(command, jsonl_file, subset, split):
+        extra = subset + "_" if subset != "full" else ""
+        if jsonl_file is not None:
+            result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
+        else:
+            result_path = None
+        for log in stream_logs(command, jsonl_file):
+            if jsonl_file is not None:
+                yield log, gr.update(value=result_path, label=result_path), gr.update()
+            else:
+                yield log, gr.update(), gr.update()
+        is_running = False
+        result_file = find_result_file()
+        if result_file:
+            return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
+                    # gr.Button(visible=False)#,
+                    # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
+        else:
+            return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
+                    # gr.Button("Run Evaluation", visible=True),
+                    # gr.DownloadButton(visible=False))
+    submit_btn.click(start_evaluation,
+                 inputs=[command_output, jsonl_file, subset, split],
+                 outputs=[log_output, download_btn])
+demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
+scheduler = BackgroundScheduler()

demo.py DELETED Viewed

@@ -1,178 +0,0 @@
-import gradio as gr
-import subprocess
-import sys
-import os
-import threading
-import time
-import uuid
-import glob
-import shutil
-from pathlib import Path
-from apscheduler.schedulers.background import BackgroundScheduler
-default_command = "bigcodebench.evaluate"
-is_running = False
-lock = threading.Lock()
-def generate_command(
-    jsonl_file, split, subset, parallel,
-    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
-    check_gt_only, no_gt
-):
-    command = [default_command]
-    if jsonl_file is not None:
-        # Copy the uploaded file to the current directory
-        local_filename = os.path.basename(jsonl_file.name)
-        shutil.copy(jsonl_file.name, local_filename)
-        command.extend(["--samples", local_filename])
-    command.extend(["--split", split, "--subset", subset])
-    if parallel is not None and parallel != 0:
-        command.extend(["--parallel", str(int(parallel))])
-    command.extend([
-        "--min-time-limit", str(min_time_limit),
-        "--max-as-limit", str(int(max_as_limit)),
-        "--max-data-limit", str(int(max_data_limit)),
-        "--max-stack-limit", str(int(max_stack_limit))
-    ])
-    if check_gt_only:
-        command.append("--check-gt-only")
-    if no_gt:
-        command.append("--no-gt")
-    return " ".join(command)
-def cleanup_previous_files(jsonl_file):
-    if jsonl_file is not None:
-        file_list = ['Dockerfile', 'app.py', 'README.md', os.path.basename(jsonl_file.name), "__pycache__"]
-    else:
-        file_list = ['Dockerfile', 'app.py', 'README.md', "__pycache__"]
-    for file in glob.glob("*"):
-        try:
-            if file not in file_list:
-                os.remove(file)
-        except Exception as e:
-            print(f"Error during cleanup of {file}: {e}")
-def find_result_file():
-    json_files = glob.glob("*.json")
-    if json_files:
-        return max(json_files, key=os.path.getmtime)
-    return None
-def run_bigcodebench(command):
-    global is_running
-    with lock:
-        if is_running:
-            yield "A command is already running. Please wait for it to finish.\n"
-            return
-        is_running = True
-    try:
-        yield f"Executing command: {command}\n"
-        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
-        for line in process.stdout:
-            yield line
-        # process.wait()
-        if process.returncode != 0:
-            yield f"Error: Command exited with status {process.returncode}\n"
-        yield "Evaluation completed.\n"
-        result_file = find_result_file()
-        if result_file:
-            yield f"Result file found: {result_file}\n"
-        else:
-            yield "No result file found.\n"
-    finally:
-        with lock:
-            is_running = False
-def stream_logs(command, jsonl_file=None):
-    global is_running
-    if is_running:
-        yield "A command is already running. Please wait for it to finish.\n"
-        return
-    cleanup_previous_files(jsonl_file)
-    yield "Cleaned up previous files.\n"
-    log_content = []
-    for log_line in run_bigcodebench(command):
-        log_content.append(log_line)
-        yield "".join(log_content)
-with gr.Blocks() as demo:
-    gr.Markdown("# BigCodeBench Evaluator")
-    with gr.Row():
-        jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
-        split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
-        subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
-    with gr.Row():
-        parallel = gr.Number(label="Parallel (optional)", precision=0)
-        min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
-        max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
-    with gr.Row():
-        max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
-        max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
-        check_gt_only = gr.Checkbox(label="Check GT Only")
-        no_gt = gr.Checkbox(label="No GT")
-    command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
-    with gr.Row():
-        submit_btn = gr.Button("Run Evaluation")
-        download_btn = gr.DownloadButton(label="Download Result")
-    log_output = gr.Textbox(label="Execution Logs", lines=20)
-    input_components = [
-        jsonl_file, split, subset, parallel,
-        min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
-        check_gt_only, no_gt
-    ]
-    for component in input_components:
-        component.change(generate_command, inputs=input_components, outputs=command_output)
-    def start_evaluation(command, jsonl_file, subset, split):
-        extra = subset + "_" if subset != "full" else ""
-        if jsonl_file is not None:
-            result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
-        else:
-            result_path = None
-        for log in stream_logs(command, jsonl_file):
-            if jsonl_file is not None:
-                yield log, gr.update(value=result_path, label=result_path), gr.update()
-            else:
-                yield log, gr.update(), gr.update()
-        is_running = False
-        result_file = find_result_file()
-        if result_file:
-            return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
-                    # gr.Button(visible=False)#,
-                    # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
-        else:
-            return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
-                    # gr.Button("Run Evaluation", visible=True),
-                    # gr.DownloadButton(visible=False))
-    submit_btn.click(start_evaluation,
-                 inputs=[command_output, jsonl_file, subset, split],
-                 outputs=[log_output, download_btn])
-demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
-scheduler = BackgroundScheduler()