Clémentine committed • Commit 84b5dfa • Parent(s): 4b2522c
POC, v0

Files changed:
- app.py +75 -4
- requirements.txt +2 -1
- src/leaderboards/get_from_hub.py +16 -5
- src/leaderboards/saved.py +1 -1
- src/static/about.py +29 -26
- src/static/display.py +22 -0
- src/static/tag_info.py +157 -0
app.py
CHANGED
@@ -1,31 +1,102 @@
 import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
 from src.static.env import API, REPO_ID, HF_TOKEN
-from src.static.about import TITLE, INTRO, ABOUT
+from src.static.about import TITLE, INTRO, ABOUT, DOCUMENTATION
 
 from src.leaderboards.get_from_hub import get_leaderboard_info
+from src.static.tag_info import *
+from src.static.display import make_clickable
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
 
-
+LEADERBOARDS_TO_INFO, INFO_TO_LEADERBOARDS = get_leaderboard_info()
+
+def update_leaderboards(show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags):
+    spaces_of_interest = []
+    if show_all:
+        spaces_of_interest = INFO_TO_LEADERBOARDS["all"]
+    else:
+        for tag in modality_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["modality"][tag.lower()])
+        for tag in submission_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["submission"][tag.lower()])
+        for tag in test_set_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["test"][tag.lower()])
+        for tag in evaluation_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["modality"][tag.lower()])
+        for tag in language_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["language"][tag.lower()])
+
+    return "- " + "\n - ".join([
+        make_clickable(space) +
+        f"\n*Tags: {', '.join(LEADERBOARDS_TO_INFO[space])}*"
+        for space in spaces_of_interest
+    ])
+
 
 
 demo = gr.Blocks()
 with demo:
-    gr.
+    gr.Markdown(TITLE)
     gr.Markdown(INTRO, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Search"):
             gr.Markdown("Let's look for leaderboards relevant for you! Select the categories of your choice")
-
+            with gr.Row():
+                with gr.Column():
+                    show_all = gr.Checkbox(
+                        value=False,
+                        label="Show all leaderboards"
+                    )
+
+                    modality_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in Modality],
+                        value=[],
+                        label="Modality of choice"
+                    )
+                    submission_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in SubmissionType],
+                        value=[],
+                        label="Submission type"
+                    )
+                    test_set_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in TestSetStatus],
+                        value=[],
+                        label="Test set status"
+                    )
+                with gr.Column():
+                    evaluation_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in EvaluationCategory],
+                        value=[],
+                        label="Specific evaluation categories"
+                    )
+                    language_tags = gr.CheckboxGroup(
+                        choices=[tag.capitalize() for tag in sorted(list(INFO_TO_LEADERBOARDS["language"].keys()))],
+                        value=[],
+                        label="Specific languages"
+                    )
+            with gr.Row():
+                leaderboards = gr.Markdown(
+                    value="",
+                )
+
+            for selector in [show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags]:
+                selector.change(
+                    update_leaderboards,
+                    [show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags],
+                    leaderboards,
+                    queue=True,
+                )
 
         with gr.TabItem("About"):
             gr.Markdown(ABOUT, elem_classes="markdown-text")
 
+        with gr.TabItem("Documentation"):
+            gr.Markdown(DOCUMENTATION, elem_classes="markdown-text")
+
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
 scheduler.start()
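For readers less familiar with the event wiring used above: each checkbox group re-runs the same callback and rewrites a single Markdown component. Below is a minimal, self-contained sketch of that Gradio pattern (the names filter_items, tag_selector and results are illustrative, not taken from app.py):

import gradio as gr

def filter_items(tags):
    # Illustrative callback: echo the selected tags as a Markdown bullet list.
    return "\n".join(f"- {t}" for t in tags)

with gr.Blocks() as sketch:
    tag_selector = gr.CheckboxGroup(choices=["text", "image"], value=[], label="Tags")
    results = gr.Markdown(value="")
    # Every change on the selector re-renders the Markdown output, as in app.py.
    tag_selector.change(filter_items, [tag_selector], results)

sketch.launch()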
requirements.txt
CHANGED
@@ -1 +1,2 @@
-huggingface_hub
+huggingface_hub
+appscheduler
src/leaderboards/get_from_hub.py
CHANGED
@@ -45,22 +45,33 @@ def get_leaderboard_info() -> tuple[list, dict]:
     saved_leaderboards = [(k, v) for k, v in leaderboard_to_tags.items()]
 
     seen_leaderboards = []
-
+    leaderboard_to_info = defaultdict(list)
     info_to_leaderboard = defaultdict(lambda: defaultdict(list))
     for name, tags in leaderboards + arenas + saved_leaderboards:
+        # If we have a duplicate between the leaderboards from the hub (leaderboards, arena)
+        # and the ones we saved manually, we use the version from the hub
         if name in seen_leaderboards:
             continue
 
         seen_leaderboards.append(name)
 
+        # If the model has its own tags, plus the ones we saved, we aggregate them
        if name in leaderboard_to_tags:
             tags += leaderboard_to_tags[name]
 
         grouped_tags = group_all_tags(tags)
-        current_info = grouped_tags
-        current_info["name"] = name
-        leaderboard_df.append(current_info)
         for category, tags in grouped_tags.items():
             for tag in tags:
                 info_to_leaderboard[category][tag].append(name)
-
+                leaderboard_to_info[name].append(f"{category}:{tag}")
+
+    # We pass everything to sets
+    for leaderboard, tags in leaderboard_to_info.items():
+        leaderboard_to_info[leaderboard] = sorted(list(set(tags)))
+
+    for category, category_dict in info_to_leaderboard.items():
+        for tag, space_list in category_dict.items():
+            info_to_leaderboard[category][tag] = sorted(list(set(space_list)))
+
+    info_to_leaderboard["all"] = sorted(list(set(seen_leaderboards)))
+    return leaderboard_to_info, info_to_leaderboard
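As a reading aid, the two mappings returned by get_leaderboard_info() end up with roughly this shape; the space name and tags below are purely illustrative:

# leaderboard_to_info: space name -> sorted, deduplicated "category:tag" strings
leaderboard_to_info = {
    "some-org/some-leaderboard": ["modality:text", "submission:automatic"],
}

# info_to_leaderboard: category -> tag -> sorted list of space names, plus an "all" key
info_to_leaderboard = {
    "modality": {"text": ["some-org/some-leaderboard"]},
    "submission": {"automatic": ["some-org/some-leaderboard"]},
    "all": ["some-org/some-leaderboard"],  # every deduplicated space name
}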
src/leaderboards/saved.py
CHANGED
@@ -11,7 +11,7 @@ leaderboard_to_tags = {
     "mteb/leaderboard": ["submission:semiautomatic", "modality:text", "Embeddings", "modality:artefacts"],
     "gaia-benchmark/leaderboard": ["submission:automatic", "test:private", "judge:auto", "modality:text", "modality:tools", "modality:text", "modality:image", "modality:video"],
     "opencompass/opencompass-llm-leaderboard": ["submission:manual", "modality:text", "language:chinese"],
-    "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", ],
+    "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", "language:korean"],
     "BramVanroy/open_dutch_llm_leaderboard": ["submission:manual", "judge:auto", "modality:text", "language:dutch"],
     "vectara/leaderboard": ["submission:semiautomatic", "judge:model", "modality:text", "Hallucinations"],
     "facebook/CyberSecEval": ["submission:closed", "eval:code", "eval:safety"],
src/static/about.py
CHANGED
@@ -1,10 +1,12 @@
+from src.static.tag_info import *
+
 TITLE = "# Leaderboard explorer"
 
 INTRO = """
 Have you ever wondered which leaderboard would be best for your use case?
 """
 
-ABOUT = """
+ABOUT = ("""
 If you want your leaderboard to appear in our suggestions, feel free to add relevant information in its tag metadata, and it will be displayed here.
 
 # First step
@@ -21,44 +23,45 @@ tags:
 ## Submission type
 Arenas are not concerned by this category.
 
-
-
-
-- `submission:closed`: the leaderboard does not accept submissions at the moment
-
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in SubmissionType]) +
+"""
 ## Test set status
 Arenas are not concerned by this category.
 
-
-
-
-- `test:rolling`: the test sets used change regularly through time and evaluation scores are refreshed
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in TestSetStatus]) +
+"""
 
 ## Judges
-
-
-
-
+
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in Judge]) +
+"""
 
 ## Modalities
 Can be any (or several) of the following list:
-
-
-
-
-A bit outside of usual modalities
-- `modality:tools`: requires added tool usage - mostly for assistant models
-- `modality:artefacts`: the leaderboard concerns itself with machine learning artefacts as themselves, for example, quality evaluation of text embeddings.
+
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in Modality]) +
+"""
 
 ## Evaluation categories
 Can be any (or several) of the following list:
-
-
-
-
-- `eval:safety`: safety, toxicity, bias evaluations
+
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in EvaluationCategory]) +
+"""
 
 ## Language
 You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
 At the moment, we do not support language codes, please use the language name in English.
+""")
+
+DOCUMENTATION = """
+How to create your own leaderboard?
+
+I'll make an updated documentation page here at some point, but for now, you can check our [demo leaderboard org](https://huggingface.co/demo-leaderboard-backend)!
+
+You just need to duplicate the front space (and backend if you want to run your leaderboard on spaces compute), copy the datasets to your own org, and edit the env variables.
 """
src/static/display.py
ADDED
@@ -0,0 +1,22 @@
+def space_html_block(space_info) -> str:
+    url = space_info.url
+
+    return f"""
+    <article class="">
+        <a href="{url}" class="relative z-0 mx-auto flex flex-col items-center justify-center bg-gradient-to-br p-4 filter from-blue-600 to-blue-600 overflow-hidden hover:brightness-110 h-40 rounded-lg">
+            <div class="absolute left-0 top-0 h-24 w-1/2 bg-gradient-to-br from-black/20 via-transparent to-transparent"></div>
+            <div class="absolute flex items-center rounded-xl top-2.5 right-4 text-sm"><svg class="mr-1.5 text-white" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg>
+            <span class="text-white">22</span></div>
+            <div class="absolute opacity-60 text-6xl mb-1 drop-shadow-xl">{icons}</div>
+            <h4 class="z-40 max-w-full truncate text-center font-bold leading-tight text-blue-50 text-xl " style="text-shadow: 0px 1px 2px rgba(0, 0, 0, 0.25);">{name}</h4>
+        </a>
+    """
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable(space):
+    link = f"https://huggingface.co/{space}"
+
+    return model_hyperlink(link, space)
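make_clickable() is the helper app.py uses to turn each space name into a Hub link. Its behaviour is easy to see with a single call; the output below is abridged:

from src.static.display import make_clickable

html = make_clickable("gaia-benchmark/leaderboard")
# '<a target="_blank" href="https://huggingface.co/gaia-benchmark/leaderboard" ...>gaia-benchmark/leaderboard</a>'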
src/static/tag_info.py
ADDED
@@ -0,0 +1,157 @@
+from enum import Enum
+from dataclasses import dataclass
+
+@dataclass
+class Tag:
+    key: str
+    name: str # for display
+    usage: str # explains usage
+    icon: str
+
+class SubmissionType(Enum):
+    automatic = Tag(
+        key="submission:automatic",
+        name="Automatic",
+        usage="users can submit their models as such to the leaderboard, and evaluation is run automatically without human intervention",
+        icon=""
+    )
+    semiautomatic = Tag(
+        key="submission:semiautomatic",
+        name="Semi Automatic",
+        usage="the leaderboard requires the model owner to run evaluations on his side and submit the results",
+        icon=""
+    )
+    manual = Tag(
+        key="submission:manual",
+        name="Manual",
+        usage="the leaderboard requires the leaderboard owner to run evaluations for new submissions",
+        icon=""
+    )
+    closed = Tag(
+        key="submission:closed",
+        name="Closed",
+        usage="the leaderboard does not accept submissions at the moment",
+        icon=""
+    )
+
+class TestSetStatus(Enum):
+    public = Tag(
+        key="test:public",
+        name="Public",
+        usage="all the test sets used are public, the evaluations are completely reproducible",
+        icon=""
+    )
+    mix = Tag(
+        key="test:mix",
+        name="Mix",
+        usage="some test sets are public and some private",
+        icon=""
+    )
+    private = Tag(
+        key="test:private",
+        name="Private",
+        usage="all the test sets used are private, the evaluations are hard to game",
+        icon=""
+    )
+    rolling = Tag(
+        key="test:rolling",
+        name="Rolling",
+        usage="the test sets used change regularly through time and evaluation scores are refreshed",
+        icon=""
+    )
+
+class Judge(Enum):
+    public = Tag(
+        key="judge:auto",
+        name="Automatic metric",
+        usage="evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`",
+        icon=""
+    )
+    model = Tag(
+        key="judge:model",
+        name="Model",
+        usage="evaluations are run using a model as a judge approach to rate answer",
+        icon=""
+    )
+    humans = Tag(
+        key="judge:humans",
+        name="Human",
+        usage="evaluations are done by humans to rate answer - this is an arena",
+        icon=""
+    )
+    vibe_check = Tag(
+        key="judge:vibe_check",
+        name="Vibe check",
+        usage="evaluations are done manually by one or several humans",
+        icon=""
+    )
+
+class Modality(Enum):
+    text = Tag(
+        key="modality:text",
+        name="Text",
+        usage="",
+        icon=""
+    )
+    image = Tag(
+        key="modality:image",
+        name="Image",
+        usage="",
+        icon=""
+    )
+    audio = Tag(
+        key="modality:audio",
+        name="Audio",
+        usage="",
+        icon=""
+    )
+    video = Tag(
+        key="modality:video",
+        name="Video",
+        usage="",
+        icon=""
+    )
+    tools = Tag(
+        key="modality:tools",
+        name="Tools",
+        usage="requires added tool usage - mostly for assistant models (a bit outside of usual modalities)",
+        icon=""
+    )
+    artefacts = Tag(
+        key="modality:artefacts",
+        name="Artefacts",
+        usage="the leaderboard concerns itself with machine learning artefacts as themselves, for example, quality evaluation of text embeddings (a bit outside of usual modalities)",
+        icon=""
+    )
+
+class EvaluationCategory(Enum):
+    generation = Tag(
+        key="eval:generation",
+        name="Generation",
+        usage="the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...) ",
+        icon=""
+    )
+    math = Tag(
+        key="eval:math",
+        name="Math",
+        usage="the evaluation tests math abilities",
+        icon=""
+    )
+    code = Tag(
+        key="eval:code",
+        name="Code",
+        usage="the evaluation tests coding capabilities",
+        icon=""
+    )
+    performance = Tag(
+        key="eval:performance",
+        name="Performance",
+        usage="model performance (speed, energy consumption, ...)",
+        icon=""
+    )
+    safety = Tag(
+        key="eval:safety",
+        name="Safety",
+        usage="the evaluation considers safety, toxicity, bias",
+        icon=""
+    )
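Each enum member wraps a Tag dataclass, so callers reach the machine key and usage text through .value, while the enum member name itself (tag.name) is what app.py lowercases and uses as checkbox choices. For example, with the definitions above:

from src.static.tag_info import Modality, SubmissionType

assert Modality.text.value.key == "modality:text"
assert [t.name for t in SubmissionType] == ["automatic", "semiautomatic", "manual", "closed"]
# app.py lowercases these member names before looking them up in INFO_TO_LEADERBOARDS.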