Commit 4fc3864 — Clémentine committed
Parent(s): 3ac217c

init - cleaning the code base, plus adding the new system to load from contents
Files changed:
- app.py +28 -50
- src/envs.py +1 -13
- src/leaderboard/filter_models.py +12 -0
- src/leaderboard/read_evals.py +0 -261
- src/populate.py +6 -5
- src/scripts/update_all_request_files.py +0 -129
- src/submission/submit.py +1 -6
- src/tools/collections.py +0 -76
- src/{scripts → tools}/create_request_file.py +0 -0
- src/tools/plots.py +2 -4
app.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import logging
 import time
 import gradio as gr
+import datasets
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
@@ -30,21 +31,14 @@ from src.display.utils import (
 )
 from src.envs import (
     API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
     EVAL_REQUESTS_PATH,
-    …
+    AGGREGATED_REPO,
     H4_TOKEN,
-    IS_PUBLIC,
     QUEUE_REPO,
     REPO_ID,
-    RESULTS_REPO,
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.scripts.update_all_request_files import update_dynamic_files
 from src.submission.submit import add_new_eval
-from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df

 # Configure logging
@@ -101,30 +95,21 @@ def init_space(full_init: bool = True):
     # These downloads only occur on full initialization
     try:
         download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-        download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
     except Exception:
         restart_space()

     # Always retrieve the leaderboard DataFrame
-    …
-    …
-    …
-        dynamic_path=DYNAMIC_INFO_FILE_PATH,
+    leaderboard_dataset = datasets.load_dataset(AGGREGATED_REPO, "default", split="train")
+    leaderboard_df = get_leaderboard_df(
+        leaderboard_dataset=leaderboard_dataset,
         cols=COLS,
         benchmark_cols=BENCHMARK_COLS,
     )

-    if full_init:
-        # Collection update only happens on full initialization
-        update_collections(original_df)
-
-    leaderboard_df = original_df.copy()
-
     # Evaluation queue DataFrame retrieval is independent of initialization detail level
     eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

-    return leaderboard_df, …
+    return leaderboard_df, eval_queue_dfs


 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
@@ -133,14 +118,14 @@ do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"

 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, …
+leaderboard_df, eval_queue_dfs = init_space(full_init=do_full_init)
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs


 # Data processing for plots now only on demand in the respective Gradio tab
-def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(…))
-    return plot_df
+#def load_and_create_plots():
+#    plot_df = create_plot_df(create_scores_df(leaderboard_df))
+#    return plot_df


 demo = gr.Blocks(css=custom_css)
@@ -182,24 +167,24 @@ with demo:
             bool_checkboxgroup_label="Hide models",
         )

-        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
-            with gr.Row():
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        [AutoEvalColumn.average.name],
-                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        BENCHMARK_COLS,
-                        title="Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
+        #with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
+        #    with gr.Row():
+        #        with gr.Column():
+        #            plot_df = load_and_create_plots()
+        #            chart = create_metric_plot_obj(
+        #                plot_df,
+        #                [AutoEvalColumn.average.name],
+        #                title="Average of Top Scores and Human Baseline Over Time (from last update)",
+        #            )
+        #            gr.Plot(value=chart, min_width=500)
+        #        with gr.Column():
+        #            plot_df = load_and_create_plots()
+        #            chart = create_metric_plot_obj(
+        #                plot_df,
+        #                BENCHMARK_COLS,
+        #                title="Top Scores and Human Baseline Over Time (from last update)",
+        #            )
+        #            gr.Plot(value=chart, min_width=500)

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -219,7 +204,6 @@ with demo:
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
@@ -290,7 +274,6 @@ with demo:
                 base_model_name_textbox,
                 revision_name_textbox,
                 precision,
-                private,
                 weight_type,
                 model_type,
             ],
@@ -307,9 +290,4 @@ with demo:
         show_copy_button=True,
     )

-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h
-scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hour
-scheduler.start()
-
 demo.queue(default_concurrency_limit=40).launch()
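Note: the core of "the new system to load from contents" is that the Space no longer downloads the results and dynamic-info snapshots and rebuilds rows via read_evals; it pulls one pre-aggregated dataset and hands it to get_leaderboard_df. A minimal sketch of that flow, assuming the repo's own constants and helpers (AGGREGATED_REPO, get_leaderboard_df) and assuming COLS/BENCHMARK_COLS come from src.display.utils, as the import block in the hunk above suggests:

import datasets

from src.display.utils import COLS, BENCHMARK_COLS  # assumed location of the column lists
from src.envs import AGGREGATED_REPO                # "open-llm-leaderboard/contents"
from src.populate import get_leaderboard_df

# Pull the pre-aggregated leaderboard rows from the Hub dataset...
leaderboard_dataset = datasets.load_dataset(AGGREGATED_REPO, "default", split="train")

# ...and turn them into the DataFrame the gradio_leaderboard component displays.
leaderboard_df = get_leaderboard_df(
    leaderboard_dataset=leaderboard_dataset,
    cols=COLS,
    benchmark_cols=BENCHMARK_COLS,
)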
src/envs.py
CHANGED
@@ -6,13 +6,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)

 REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
 QUEUE_REPO = "open-llm-leaderboard/requests"
-…
-RESULTS_REPO = "open-llm-leaderboard/results"
-
-PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
-PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
-
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+AGGREGATED_REPO = "open-llm-leaderboard/contents"

 HF_HOME = os.getenv("HF_HOME", ".")

@@ -27,12 +21,6 @@ else:
     print("Write access confirmed for HF_HOME")

 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
-DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
-DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
-
-EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
-EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"

 PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"

src/leaderboard/filter_models.py
CHANGED
@@ -167,6 +167,18 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
             leaderboard_data.pop(ix)
     return leaderboard_data

+"""
+def remove_forbidden_models(leaderboard_data):
+    #Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
+    indices_to_remove = []
+    for ix, row in leaderboard_data.iterrows():
+        if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
+            indices_to_remove.append(ix)
+
+    # Remove the models from the list
+    return leaderboard_data.drop(indices_to_remove)
+"""
+

 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)
src/leaderboard/read_evals.py
DELETED
@@ -1,261 +0,0 @@
-import json
-from pathlib import Path
-from json import JSONDecodeError
-import logging
-import math
-
-from dataclasses import dataclass, field
-from typing import Optional, Dict, List
-
-from tqdm import tqdm
-from tqdm.contrib.logging import logging_redirect_tqdm
-
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
-
-@dataclass
-class EvalResult:
-    # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: Optional[str]
-    model: str
-    revision: str  # commit hash, "" if main
-    results: Dict[str, float]
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"  # From config file
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = True
-    is_merge: bool = False
-    not_flagged: bool = False
-    status: str = "FINISHED"
-    # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
-    tags: List[str] = field(default_factory=list)
-
-    @classmethod
-    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
-        with open(json_filepath, "r") as fp:
-            data = json.load(fp)
-
-        config = data.get("config_general", {})
-        precision = Precision.from_str(config.get("model_dtype", "unknown"))
-        org_and_model = config.get("model_name", "").split("/", 1)
-        org = org_and_model[0] if len(org_and_model) > 1 else None
-        model = org_and_model[-1]
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        results = cls.extract_results(data)  # Properly call the method to extract results
-
-        return cls(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-        )
-
-    @staticmethod
-    def extract_results(data: Dict) -> Dict[str, float]:
-        """
-        Extract and process benchmark results from a given dict.
-
-        Parameters:
-        - data (Dict): A dictionary containing benchmark data. This dictionary must
-        include 'versions' and 'results' keys with respective sub-data.
-
-        Returns:
-        - Dict[str, float]: A dictionary where keys are benchmark names and values
-        are the processed average scores as percentages.
-
-        Notes:
-        - The method specifically checks for certain benchmark names to skip outdated entries.
-        - Handles NaN values by setting the corresponding benchmark result to 0.0.
-        - Averages scores across metrics for benchmarks found in the data, in a percentage format.
-        """
-        results = {}
-        for task in Tasks:
-            task = task.value
-            # We skip old mmlu entries
-            if task.benchmark == "hendrycksTest":
-                for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
-                    if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
-                        continue
-
-            # Some benchamrk values are NaNs, mostly truthfulQA
-            # Would be more optimal (without the whole dict itertion) if benchmark name was same as key in results
-            # e.g. not harness|truthfulqa:mc|0 but truthfulqa:mc
-            for k, v in data["results"].items():
-                if task.benchmark in k:
-                    if math.isnan(float(v[task.metric])):
-                        results[task.benchmark] = 0.0
-                        continue
-
-            # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return results
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it."""
-        try:
-            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-            if request_file is None:
-                logging.warning(f"No request file for {self.org}/{self.model}")
-                self.status = "FAILED"
-                return
-
-            with open(request_file, "r") as f:
-                request = json.load(f)
-
-            self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.num_params = int(request.get("params", 0))  # Ensuring type safety
-            self.date = request.get("submitted_time", "")
-            self.architecture = request.get("architectures", "Unknown")
-            self.status = request.get("status", "FAILED")
-
-        except FileNotFoundError:
-            self.status = "FAILED"
-            logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
-        except JSONDecodeError:
-            self.status = "FAILED"
-            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
-        except KeyError as e:
-            self.status = "FAILED"
-            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
-        except Exception as e:  # Catch-all for any other unexpected exceptions
-            self.status = "FAILED"
-            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
-
-    def update_with_dynamic_file_dict(self, file_dict):
-        """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
-        # Default values set for optional or potentially missing keys.
-        self.license = file_dict.get("license", "?")
-        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
-        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
-        self.tags = file_dict.get("tags", [])
-
-        # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
-        self.not_flagged = not (any("flagged" in tag for tag in self.tags))
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.fullname.name: self.full_model,
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
-            AutoEvalColumn.moe.name: not (
-                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
-            ),
-            AutoEvalColumn.not_flagged.name: self.not_flagged,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    requests_path = Path(requests_path)
-    pattern = f"{model_name}_eval_request_*.json"
-
-    # Using pathlib to find files matching the pattern
-    request_files = list(requests_path.glob(pattern))
-
-    # Sort the files by name in descending order to mimic 'reverse=True'
-    request_files.sort(reverse=True)
-
-    # Select the correct request file based on 'status' and 'precision'
-    request_file = None
-    for request_file in request_files:
-        with request_file.open("r") as f:
-            req_content = json.load(f)
-            if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
-                request_file = str(request_file)
-
-    # Return empty string if no file found that matches criteria
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    with open(dynamic_path) as f:
-        dynamic_data = json.load(f)
-
-    results_path = Path(results_path)
-    model_files = list(results_path.rglob("results_*.json"))
-    model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
-
-    eval_results = {}
-    # Wrap model_files iteration with tqdm for progress display
-    for model_result_filepath in tqdm(model_files, desc="Processing model files"):
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        with logging_redirect_tqdm():
-            eval_result.update_with_request_file(requests_path)
-
-        if eval_result.full_model in dynamic_data:
-            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
-            # Hardcoding because of gating problem
-            if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
-                eval_result.still_on_hub = True
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for k, v in eval_results.items():
-        try:
-            if v.status == "FINISHED":
-                v.to_dict()  # we test if the dict version is complete
-                results.append(v)
-        except KeyError as e:
-            logging.error(f"Error while checking model {k} {v.date} json, no key: {e}")  # not all eval values present
-            continue
-
-    return results
src/populate.py
CHANGED
@@ -1,9 +1,9 @@
 import pathlib
 import pandas as pd
+from datasets import Dataset
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models_flags
-from src.leaderboard.read_evals import get_raw_eval_results
 from src.display.utils import load_json_data


@@ -39,14 +39,15 @@ def get_evaluation_queue_df(save_path, cols):
     return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])


-def get_leaderboard_df(…):
+def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols: list):
     """Retrieve and process leaderboard data."""
-    …
-    …
+    all_data_json = leaderboard_dataset.to_dict()
+    num_items = leaderboard_dataset.num_rows
+    all_data_json = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
     filter_models_flags(all_data_json)

     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return …
+    return df
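Note: get_leaderboard_df now flattens the datasets.Dataset column-by-column into a list of row dicts before filtering and building the DataFrame, since filter_models_flags works on a plain list of dicts. A self-contained sketch of that conversion with made-up columns (not the leaderboard's real schema):

from datasets import Dataset

# Toy stand-in for the aggregated leaderboard contents (illustrative columns only).
ds = Dataset.from_dict({"fullname": ["org-a/model-a", "org-b/model-b"], "Average": [71.2, 68.9]})

columns = ds.to_dict()  # column name -> list of values
rows = [{k: columns[k][ix] for k in columns} for ix in range(ds.num_rows)]  # one dict per row
# rows == [{"fullname": "org-a/model-a", "Average": 71.2},
#          {"fullname": "org-b/model-b", "Average": 68.9}]

Dataset.to_pandas() would give the DataFrame in one step; the row-dict detour keeps the existing list-of-dicts filtering code unchanged.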
src/scripts/update_all_request_files.py
DELETED
@@ -1,129 +0,0 @@
-import json
-import os
-import time
-
-from huggingface_hub import snapshot_download
-
-from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
-from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
-
-
-def update_one_model(model_id, data, models_on_the_hub):
-    # Model no longer on the hub at all
-    if model_id not in models_on_the_hub:
-        data["still_on_hub"] = False
-        data["likes"] = 0
-        data["downloads"] = 0
-        data["created_at"] = ""
-        data["tags"] = []
-        return data
-
-    # Grabbing model parameters
-    model_cfg = models_on_the_hub[model_id]
-    data["likes"] = model_cfg.likes
-    data["downloads"] = model_cfg.downloads
-    data["created_at"] = str(model_cfg.created_at)
-    data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
-
-    # Grabbing model details
-    model_name = model_id
-    if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
-        if isinstance(model_cfg.card_data.base_model, str):
-            model_name = model_cfg.card_data.base_model  # for adapters, we look at the parent model
-    still_on_hub, _, _ = is_model_on_hub(
-        model_name=model_name,
-        revision=data.get("revision"),
-        trust_remote_code=True,
-        test_tokenizer=False,
-        token=H4_TOKEN,
-    )
-    # If the model doesn't have a model card or a license, we consider it's deleted
-    if still_on_hub:
-        try:
-            status, _, model_card = check_model_card(model_id)
-            if status is False:
-                still_on_hub = False
-        except Exception:
-            model_card = None
-            still_on_hub = False
-    data["still_on_hub"] = still_on_hub
-
-    tags = get_model_tags(model_card, model_id) if still_on_hub else []
-
-    data["tags"] = tags
-    return data
-
-
-def update_models(file_path, models_on_the_hub):
-    """
-    Search through all JSON files in the specified root folder and its subfolders,
-    and update the likes key in JSON dict from value of input dict
-    """
-    seen_models = []
-    with open(file_path, "r") as f:
-        model_infos = json.load(f)
-        for model_id in model_infos.keys():
-            seen_models.append(model_id)
-            model_infos[model_id] = update_one_model(
-                model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
-            )
-
-    # If new requests files have been created since we started all this
-    # we grab them
-    all_models = []
-    try:
-        for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
-            if ix == 0:
-                continue
-            for file in files:
-                if "eval_request" in file:
-                    path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
-                    all_models.append(path)
-    except Exception as e:
-        print(e)
-        pass
-
-    for model_id in all_models:
-        if model_id not in seen_models:
-            model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
-
-    with open(file_path, "w") as f:
-        json.dump(model_infos, f, indent=2)
-
-
-def update_dynamic_files():
-    """This will only update metadata for models already linked in the repo, not add missing ones."""
-    snapshot_download(
-        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-
-    print("UPDATE_DYNAMIC: Loaded snapshot")
-    # Get models
-    start = time.time()
-
-    models = list(
-        API.list_models(
-            # filter=ModelFilter(task="text-generation"),
-            full=False,
-            cardData=True,
-            fetch_config=True,
-        )
-    )
-    id_to_model = {model.id: model for model in models}
-
-    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
-
-    start = time.time()
-
-    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
-
-    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
-
-    API.upload_file(
-        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
-        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
-        repo_id=DYNAMIC_INFO_REPO,
-        repo_type="dataset",
-        commit_message="Daily request file update.",
-    )
-    print("UPDATE_DYNAMIC: pushed to hub")
src/submission/submit.py
CHANGED
@@ -7,9 +7,6 @@ from huggingface_hub import snapshot_download
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import (
     API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
     EVAL_REQUESTS_PATH,
     H4_TOKEN,
     QUEUE_REPO,
@@ -35,7 +32,6 @@ def add_new_eval(
     base_model: str,
     revision: str,
     precision: str,
-    private: bool,
     weight_type: str,
     model_type: str,
 ):
@@ -126,7 +122,6 @@ def add_new_eval(
         "model": model,
         "base_model": base_model,
         "revision": model_info.sha,  # force to use the exact model commit
-        "private": private,
         "precision": precision,
         "params": model_size,
         "architectures": architecture,
@@ -154,7 +149,7 @@ def add_new_eval(
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}…
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))
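Note: with the private flag removed from add_new_eval, the request-file name now carries a hardcoded False segment (the old, truncated line presumably interpolated a value there). With illustrative values, the file written to the requests repo looks like:

# Illustrative values only; OUT_DIR is really EVAL_REQUESTS_PATH/<user_name>.
OUT_DIR = "./eval-queue/org-a"
model_path, precision, weight_type = "model-a-7b", "bfloat16", "Original"

out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
print(out_path)  # ./eval-queue/org-a/model-a-7b_eval_request_False_bfloat16_Original.json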
src/tools/collections.py
DELETED
@@ -1,76 +0,0 @@
-import pandas as pd
-from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
-from huggingface_hub.utils._errors import HfHubHTTPError
-from pandas import DataFrame
-
-from src.display.utils import AutoEvalColumn, ModelType
-from src.envs import H4_TOKEN, PATH_TO_COLLECTION
-
-# Specific intervals for the collections
-intervals = {
-    "1B": pd.Interval(0, 1.5, closed="right"),
-    "3B": pd.Interval(2.5, 3.5, closed="neither"),
-    "7B": pd.Interval(6, 8, closed="neither"),
-    "13B": pd.Interval(10, 14, closed="neither"),
-    "30B": pd.Interval(25, 35, closed="neither"),
-    "65B": pd.Interval(60, 70, closed="neither"),
-}
-
-
-def _filter_by_type_and_size(df, model_type, size_interval):
-    """Filter DataFrame by model type and parameter size interval."""
-    type_emoji = model_type.value.symbol[0]
-    filtered_df = df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: x in size_interval)
-    return filtered_df.loc[mask]
-
-
-def _add_models_to_collection(collection, models, model_type, size):
-    """Add best models to the collection and update positions."""
-    cur_len_collection = len(collection.items)
-    for ix, model in enumerate(models, start=1):
-        try:
-            collection = add_collection_item(
-                PATH_TO_COLLECTION,
-                item_id=model,
-                item_type="model",
-                exists_ok=True,
-                note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
-                token=H4_TOKEN,
-            )
-            # Ensure position is correct if item was added
-            if len(collection.items) > cur_len_collection:
-                item_object_id = collection.items[-1].item_object_id
-                update_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix)
-                cur_len_collection = len(collection.items)
-            break  # assuming we only add the top model
-        except HfHubHTTPError:
-            continue
-
-
-def update_collections(df: DataFrame):
-    """Update collections by filtering and adding the best models."""
-    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
-    cur_best_models = []
-
-    for model_type in ModelType:
-        if not model_type.value.name:
-            continue
-        for size, interval in intervals.items():
-            filtered_df = _filter_by_type_and_size(df, model_type, interval)
-            best_models = list(
-                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
-            )
-            print(model_type.value.symbol, size, best_models)
-            _add_models_to_collection(collection, best_models, model_type, size)
-            cur_best_models.extend(best_models)
-
-    # Cleanup
-    existing_models = {item.item_id for item in collection.items}
-    to_remove = existing_models - set(cur_best_models)
-    for item_id in to_remove:
-        try:
-            delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
-        except HfHubHTTPError:
-            continue
src/{scripts → tools}/create_request_file.py
RENAMED
File without changes
src/tools/plots.py
CHANGED
@@ -6,10 +6,9 @@ from plotly.graph_objs import Figure
 from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
 from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
-from src.leaderboard.read_evals import EvalResult


-def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
+def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
    """
    Generates a DataFrame containing the maximum scores until each date.

@@ -17,8 +16,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
    :return: A new DataFrame containing the maximum scores until each date for every metric.
    """
    # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
-    results_df = pd.…
-    # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+    results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
    results_df.sort_values(by="date", inplace=True)

    # Step 2: Initialize the scores dictionary
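Note: create_scores_df now re-enables the date normalization that was previously commented out, so the incoming data only needs a "date" column in a pandas-parseable format (the new list[dict] annotation notwithstanding, the body uses DataFrame operations, so a DataFrame is what actually flows through). A small sketch with made-up rows:

import pandas as pd

# Illustrative rows; the real data has one row per evaluated model.
results_df = pd.DataFrame(
    {"date": ["2024-05-01T12:00:00Z", "2024-04-30 08:30:00"], "Average": [70.1, 69.4]}
)

# Mixed input formats are parsed per element, normalized to UTC, then sorted chronologically.
results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
results_df.sort_values(by="date", inplace=True)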