Spaces:

open-llm-leaderboard
/

comparator

Running

App Files Community

albertvillanova HF staff committed 2 days ago

Commit

611a3ed

•

1 Parent(s): e3edf6d

Fix style

Browse files

Files changed (5) hide show

app.py +43 -14
src/constants.py +1 -1
src/details.py +9 -4
src/hub.py +2 -1
src/results.py +18 -8

app.py CHANGED Viewed

@@ -3,12 +3,27 @@ from functools import partial
 import gradio as gr
 import src.constants as constants
-from src.details import update_subtasks_component, update_load_details_component, load_details_dataframes, \
-    display_details, update_sample_idx_component, clear_details, update_task_description_component, \
-    display_loading_message_for_details
-from src.results import update_load_results_component, \
-    load_results_dataframes, display_results, update_tasks_component, clear_results, \
-    sort_result_paths_per_model, fetch_result_paths, display_loading_message_for_results
 # if __name__ == "__main__":
 result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
@@ -67,7 +82,7 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
             configs = gr.HTML()
         with gr.Tab("Details"):
             details_task = gr.Radio(
-                list(value for value in constants.TASKS.values() if value[1] != "leaderboard_gpqa"),
                 label="Tasks",
                 info="Evaluation tasks to be loaded",
                 interactive=True,
@@ -84,11 +99,7 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
             load_details_btn = gr.Button("Load Details", interactive=False)
             clear_details_btn = gr.Button("Clear Details")
             sample_idx = gr.Number(
-                label="Sample Index",
-                info="Index of the sample to be displayed",
-                value=0,
-                minimum=0,
-                visible=False
             )
             details = gr.HTML()
             details_dataframe_1 = gr.Dataframe(visible=False)
@@ -135,7 +146,16 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
     gr.on(
         triggers=[clear_results_btn.click, clear_configs_btn.click],
         fn=clear_results,
-        outputs=[model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task],
     )
     # DETAILS:
@@ -174,7 +194,16 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
     )
     clear_details_btn.click(
         fn=clear_details,
-        outputs=[model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx],
     )
 demo.launch()

 import gradio as gr
 import src.constants as constants
+from src.details import (
+    clear_details,
+    display_details,
+    display_loading_message_for_details,
+    load_details_dataframes,
+    update_load_details_component,
+    update_sample_idx_component,
+    update_subtasks_component,
+    update_task_description_component,
+)
+from src.results import (
+    clear_results,
+    display_loading_message_for_results,
+    display_results,
+    fetch_result_paths,
+    load_results_dataframes,
+    sort_result_paths_per_model,
+    update_load_results_component,
+    update_tasks_component,
+)
 # if __name__ == "__main__":
 result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
             configs = gr.HTML()
         with gr.Tab("Details"):
             details_task = gr.Radio(
+                [value for value in constants.TASKS.values() if value[1] != "leaderboard_gpqa"],
                 label="Tasks",
                 info="Evaluation tasks to be loaded",
                 interactive=True,
             load_details_btn = gr.Button("Load Details", interactive=False)
             clear_details_btn = gr.Button("Clear Details")
             sample_idx = gr.Number(
+                label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0, visible=False
             )
             details = gr.HTML()
             details_dataframe_1 = gr.Dataframe(visible=False)
     gr.on(
         triggers=[clear_results_btn.click, clear_configs_btn.click],
         fn=clear_results,
+        outputs=[
+            model_id_1,
+            model_id_2,
+            dataframe_1,
+            dataframe_2,
+            load_results_btn,
+            load_configs_btn,
+            results_task,
+            configs_task,
+        ],
     )
     # DETAILS:
     )
     clear_details_btn.click(
         fn=clear_details,
+        outputs=[
+            model_id_1,
+            model_id_2,
+            details_dataframe_1,
+            details_dataframe_2,
+            details_task,
+            subtask,
+            load_details_btn,
+            sample_idx,
+        ],
     )
 demo.launch()

src/constants.py CHANGED Viewed

@@ -70,4 +70,4 @@ TASK_DESCRIPTIONS = {
     "leaderboard_math": "MATH is a compilation of high-school level competition problems gathered from several sources, formatted consistently using Latex for equations and Asymptote for figures. Generations must fit a very specific output format. We keep only level 5 MATH questions and call it MATH Lvl 5.",
     "leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
     "leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
-}

     "leaderboard_math": "MATH is a compilation of high-school level competition problems gathered from several sources, formatted consistently using Latex for equations and Asymptote for figures. Generations must fit a very specific output format. We keep only level 5 MATH questions and call it MATH Lvl 5.",
     "leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
     "leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
+}

src/details.py CHANGED Viewed

@@ -67,6 +67,7 @@ def display_details(sample_idx, *dfs):
         return
     # Pop model_name and add it to the column name
     df = pd.concat([row.rename(row.pop("model_name")) for row in rows], axis="columns")
     # Wrap long strings to avoid overflow; e.g. URLs in "doc.Websites visited_NEV_2"
     def wrap(row):
         try:
@@ -78,8 +79,7 @@ def display_details(sample_idx, *dfs):
     df = df.apply(wrap, axis=1)
     # Style
     return (
-        df.style
-        .format(escape="html", na_rep="")
         # .hide(axis="index")
         .to_html()
     )
@@ -100,9 +100,14 @@ def update_sample_idx_component(*dfs):
 def clear_details():
     # model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx
     return (
-        None, None, None, None, None, None,
         gr.Button("Load Details", interactive=False),
-        gr.Number(label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0,visible=False),
     )

         return
     # Pop model_name and add it to the column name
     df = pd.concat([row.rename(row.pop("model_name")) for row in rows], axis="columns")
     # Wrap long strings to avoid overflow; e.g. URLs in "doc.Websites visited_NEV_2"
     def wrap(row):
         try:
     df = df.apply(wrap, axis=1)
     # Style
     return (
+        df.style.format(escape="html", na_rep="")
         # .hide(axis="index")
         .to_html()
     )
 def clear_details():
     # model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx
     return (
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
         gr.Button("Load Details", interactive=False),
+        gr.Number(label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0, visible=False),
     )

src/hub.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import httpx
 from huggingface_hub import hf_hub_url
 from huggingface_hub.utils import build_hf_headers
-import json
 client = httpx.AsyncClient()

+import json
 import httpx
 from huggingface_hub import hf_hub_url
 from huggingface_hub.utils import build_hf_headers
 client = httpx.AsyncClient()

src/results.py CHANGED Viewed

@@ -20,13 +20,13 @@ def sort_result_paths_per_model(paths):
     d = defaultdict(list)
     for path in paths:
-        model_id, _ = path[len(constants.RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
         d[model_id].append(path)
     return {model_id: sorted(paths) for model_id, paths in d.items()}
 def update_load_results_component():
-    return (gr.Button("Load", interactive=True), ) * 2
 async def load_results_dataframe(model_id, result_paths_per_model=None):
@@ -45,7 +45,9 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
 async def load_results_dataframes(*model_ids, result_paths_per_model=None):
-    result = await asyncio.gather(*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids])
     return result
@@ -68,7 +70,11 @@ def display_tab(tab, df, task):
                 not row.startswith(f"{tab}.")
                 or row.startswith(f"{tab}.leaderboard.")
                 or row.endswith(".alias")
-                or (not row.startswith(f"{tab}.{task}") if task != "All" else row.startswith(f"{tab}.leaderboard_arc_challenge"))
             )
         ],
         axis="index",
@@ -94,8 +100,11 @@ def update_tasks_component():
 def clear_results():
     # model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
     return (
-        None, None, None, None,
-        *(gr.Button("Load", interactive=False), ) * 2,
         *(
             gr.Radio(
                 ["All"] + list(constants.TASKS.values()),
@@ -104,7 +113,8 @@ def clear_results():
                 value="All",
                 visible=False,
             ),
-        ) * 2,
     )
@@ -116,4 +126,4 @@ def highlight_min_max(s):
 def display_loading_message_for_results():
-    return ("<h3 style='text-align: center;'>Loading...</h3>", ) * 2

     d = defaultdict(list)
     for path in paths:
+        model_id, _ = path[len(constants.RESULTS_DATASET_ID) + 1 :].rsplit("/", 1)
         d[model_id].append(path)
     return {model_id: sorted(paths) for model_id, paths in d.items()}
 def update_load_results_component():
+    return (gr.Button("Load", interactive=True),) * 2
 async def load_results_dataframe(model_id, result_paths_per_model=None):
 async def load_results_dataframes(*model_ids, result_paths_per_model=None):
+    result = await asyncio.gather(
+        *[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids]
+    )
     return result
                 not row.startswith(f"{tab}.")
                 or row.startswith(f"{tab}.leaderboard.")
                 or row.endswith(".alias")
+                or (
+                    not row.startswith(f"{tab}.{task}")
+                    if task != "All"
+                    else row.startswith(f"{tab}.leaderboard_arc_challenge")
+                )
             )
         ],
         axis="index",
 def clear_results():
     # model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
     return (
+        None,
+        None,
+        None,
+        None,
+        *(gr.Button("Load", interactive=False),) * 2,
         *(
             gr.Radio(
                 ["All"] + list(constants.TASKS.values()),
                 value="All",
                 visible=False,
             ),
+        )
+        * 2,
     )
 def display_loading_message_for_results():
+    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2