open_llm_leaderboard_old

Sleeping

App Files Files Community

clefourrier HF staff

alozowski HF staff committed on Apr 16

Commit

6b87e28

•

1 Parent(s): 8ff5577

dataframe-improvement (#671)

Browse files

- Updated init_space() mostly (e34e357137b1ac54e7f2db292b77c14d4c7cf0ed)
- Updated collections.py (2293858bb036fc9f69040d0210b6db1678b7bdf9)
- Updated populate.py (6b9cbbe716f8f1b4c4d5c3925fbc1d1c27381b5f)
- Updated gitignore (122c7afd045b064431b1ae27c3c543c9dbd1a482)
- bugfix and populate refactoring (2e74c81428ac062c254bc55b88eadf06d877f532)
- updated utils.py (f073c67652ed110738fb31ccb2abf2dd2c2b5156)
- removed comments from populate.py (79ad1ade160afd2ba0f95bd1dec9e8534121f132)
- fixing envs CACHE_PATH check (63dac32758e6a31233c5c57913eda3b53e53c266)
- debugging CACHE_PATH in envs.py (6a5081fbccfd95fb301ba4d8cb446e2c101b337c)
- debugging CACHE_PATH in envs.py (e243a5f654ee69c2a62cb3dd438a7dedc3631c22)
- debugging CACHE_PATH in envs.py (5a8f7dc96273e99cd0895dc13e4a4e476c1eb629)
- small fixed (d8bf61b20d803025270c3395b0b0bf1d68af5576)

Co-authored-by: Alina Lozovskaya <[email protected]>

Files changed (7) hide show

.gitignore +5 -0
.python-version +0 -1
app.py +39 -45
src/display/utils.py +11 -1
src/envs.py +15 -4
src/populate.py +38 -51
src/tools/collections.py +48 -53

.gitignore CHANGED Viewed

@@ -1,10 +1,15 @@
 venv/
 __pycache__/
 .env
 .ipynb_checkpoints
 *ipynb
 .vscode/
 .DS_Store
 eval-queue/
 eval-results/

 venv/
+.venv/
 __pycache__/
 .env
 .ipynb_checkpoints
 *ipynb
 .vscode/
 .DS_Store
+.ruff_cache/
+.python-version
+.profile_app.python
+*pstats
 eval-queue/
 eval-results/

.python-version DELETED Viewed

	@@ -1 +0,0 @@
1	- 3.10.0

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -47,6 +48,7 @@ from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
@@ -55,44 +57,34 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
-def init_space(full_init: bool = True):
-    if full_init:
-        try:
-            print(EVAL_REQUESTS_PATH)
-            snapshot_download(
-                repo_id=QUEUE_REPO,
-                local_dir=EVAL_REQUESTS_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-        except Exception:
-            restart_space()
         try:
-            print(DYNAMIC_INFO_PATH)
             snapshot_download(
-                repo_id=DYNAMIC_INFO_REPO,
-                local_dir=DYNAMIC_INFO_PATH,
-                repo_type="dataset",
                 tqdm_class=None,
                 etag_timeout=30,
                 max_workers=8,
             )
-        except Exception:
-            restart_space()
-        try:
-            print(EVAL_RESULTS_PATH)
-            snapshot_download(
-                repo_id=RESULTS_REPO,
-                local_dir=EVAL_RESULTS_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-        except Exception:
-            restart_space()
     raw_data, original_df = get_leaderboard_df(
         results_path=EVAL_RESULTS_PATH,
@@ -101,18 +93,12 @@ def init_space(full_init: bool = True):
         cols=COLS,
         benchmark_cols=BENCHMARK_COLS,
     )
-    update_collections(original_df.copy())
     leaderboard_df = original_df.copy()
-    plot_df = create_plot_df(create_scores_df(raw_data))
-    (
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-    return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
@@ -121,9 +107,14 @@ do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = (
-    init_space(full_init=do_full_init)
-)
 # Searching and filtering
@@ -406,6 +397,7 @@ with demo:
         with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
                     chart = create_metric_plot_obj(
                         plot_df,
                         [AutoEvalColumn.average.name],
@@ -413,12 +405,14 @@ with demo:
                     )
                     gr.Plot(value=chart, min_width=500)
                 with gr.Column():
                     chart = create_metric_plot_obj(
                         plot_df,
                         BENCHMARK_COLS,
                         title="Top Scores and Human Baseline Over Time (from last update)",
                     )
                     gr.Plot(value=chart, min_width=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

 import os
+import logging
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
    """Download a Hub repo snapshot into ``local_dir``, retrying on failure.

    Every failed attempt is logged. When the last allowed attempt fails,
    ``restart_space()`` is called and the function then returns ``None``.
    Nothing is attempted when ``max_attempts`` is zero or negative.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            print(f"Downloading {repo_id} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type=repo_type,
                tqdm_class=None,
                etag_timeout=30,
                max_workers=8,
            )
        except Exception as err:
            logging.error(f"Error downloading {repo_id}: {err}")
            # Out of retries: ask for a restart instead of looping again.
            if attempt == max_attempts:
                restart_space()
        else:
            return
+def init_space(full_init: bool = True):
+    """Initializes the application space, loading only necessary data."""
+    if full_init:
+        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+        download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
+        download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
     raw_data, original_df = get_leaderboard_df(
         results_path=EVAL_RESULTS_PATH,
         cols=COLS,
         benchmark_cols=BENCHMARK_COLS,
     )
+    update_collections(original_df)
     leaderboard_df = original_df.copy()
+    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    return leaderboard_df, raw_data, original_df, eval_queue_dfs
 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
+leaderboard_df, raw_data, original_df, eval_queue_dfs = init_space(full_init=do_full_init)
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
+# Data processing for plots now only on demand in the respective Gradio tab
def load_and_create_plots():
    """Build and return the plot DataFrame from the module-level ``raw_data``."""
    scores_df = create_scores_df(raw_data)
    return create_plot_df(scores_df)
 # Searching and filtering
         with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
+                    plot_df = load_and_create_plots()
                     chart = create_metric_plot_obj(
                         plot_df,
                         [AutoEvalColumn.average.name],
                     )
                     gr.Plot(value=chart, min_width=500)
                 with gr.Column():
+                    plot_df = load_and_create_plots()
                     chart = create_metric_plot_obj(
                         plot_df,
                         BENCHMARK_COLS,
                         title="Top Scores and Human Baseline Over Time (from last update)",
                     )
                     gr.Plot(value=chart, min_width=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

src/display/utils.py CHANGED Viewed

@@ -1,9 +1,19 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 import pandas as pd
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

 from dataclasses import dataclass, make_dataclass
 from enum import Enum
+import json
 import pandas as pd
def load_json_data(file_path):
    """Safely load JSON data from a file.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's locale encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or ``None`` when the file content is not valid
        JSON or not valid UTF-8.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Undecodable bytes are treated like malformed JSON: report and skip.
        print(f"Error reading JSON from {file_path}")
        return None
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

src/envs.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 from huggingface_hub import HfApi
@@ -15,11 +16,21 @@ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
-CACHE_PATH = os.getenv("HF_HOME", ".")
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-DYNAMIC_INFO_PATH = os.path.join(CACHE_PATH, "dynamic-info")
 DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"

 import os
+import logging
 from huggingface_hub import HfApi
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
# Base directory for the local dataset snapshots; defaults to the current directory.
HF_HOME = os.getenv("HF_HOME", ".")
# Check HF_HOME write access
print(f"Initial HF_HOME set to: {HF_HOME}")
if not os.access(HF_HOME, os.W_OK):
    # os.access() is also False when the path does not exist, so a missing
    # HF_HOME directory takes this fallback branch as well.
    print(f"No write access to HF_HOME: {HF_HOME}. Resetting to current directory.")
    HF_HOME = "."
    # Export the fallback so code that reads the environment variable sees the same value.
    os.environ["HF_HOME"] = HF_HOME
else:
    print("Write access confirmed for HF_HOME")
# Local checkout locations for the request queue, results and dynamic-info datasets.
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
 DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"

src/populate.py CHANGED Viewed

@@ -1,68 +1,55 @@
 import json
 import os
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results
-def get_leaderboard_df(
-    results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list
-) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-    all_data_json.append(baseline_row)
-    print([data for data in all_data_json if data["model_name_for_query"] == "databricks/dbrx-base"])
     filter_models_flags(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
-    print(df.columns)
-    print(df[df["model_name_for_query"] == "databricks/dbrx-base"])
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
-    # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    try:
-                        data = json.load(fp)
-                    except json.JSONDecodeError:
-                        print(f"Error reading {file_path}")
-                        continue
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]

 import json
 import os
+import pathlib
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import load_json_data
def _process_model_data(entry, model_name_key="model", revision_key="revision"):
    """Fill in the queue display columns on ``entry`` in place and return it.

    The model column receives ``make_clickable_model`` applied to the value
    under ``model_name_key`` (empty string when absent). The revision column
    receives the value under ``revision_key``, or ``"main"`` when absent.
    """
    linked_model = make_clickable_model(entry.get(model_name_key, ""))
    entry[EvalQueueColumn.model.name] = linked_model
    entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
    return entry
def get_evaluation_queue_df(save_path, cols):
    """Generate dataframes for finished, running, and pending evaluation entries.

    Every ``*.json`` file found recursively under ``save_path`` is loaded with
    ``load_json_data`` and enriched with ``_process_model_data``. Files that
    fail to load, are empty, or do not contain a JSON object are skipped.

    Entries are grouped by their ``"status"`` value:
      * PENDING:  ``"PENDING"`` or ``"RERUN"``
      * RUNNING:  ``"RUNNING"``
      * FINISHED: ``"PENDING_NEW_EVAL"`` or any status starting with ``"FINISHED"``
    Entries with a missing or unrecognised status are left out of all groups.

    Args:
        save_path: Directory holding the request files.
        cols: Column names for the resulting dataframes.

    Returns:
        A tuple ``(finished_df, running_df, pending_df)``.
    """
    all_evals = []
    for path in pathlib.Path(save_path).rglob("*.json"):
        data = load_json_data(path)
        # load_json_data returns None for unreadable files; anything that is
        # not a non-empty JSON object cannot be a request entry either.
        if isinstance(data, dict) and data:
            all_evals.append(_process_model_data(data))

    buckets = {"FINISHED": [], "RUNNING": [], "PENDING": []}
    for eval_data in all_evals:
        status = eval_data.get("status")
        if status in ("PENDING", "RERUN"):
            buckets["PENDING"].append(eval_data)
        elif status == "RUNNING":
            buckets["RUNNING"].append(eval_data)
        elif status == "PENDING_NEW_EVAL" or (isinstance(status, str) and status.startswith("FINISHED")):
            # Prefix match so that suffixed FINISHED statuses are still shown as finished.
            buckets["FINISHED"].append(eval_data)

    return tuple(pd.DataFrame(buckets[name], columns=cols) for name in ("FINISHED", "RUNNING", "PENDING"))
def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
    """Build the leaderboard table from the raw evaluation results.

    The raw results are converted to dicts, ``baseline_row`` is appended, and
    the list is passed through ``filter_models_flags``. The resulting frame is
    sorted by the average column (descending), restricted to ``cols``, rounded
    to two decimals, and filtered with the mask returned by
    ``has_no_nan_values(df, benchmark_cols)``.

    Returns:
        A tuple ``(raw_data, df)`` of the raw results and the final dataframe.
    """
    raw_data = get_raw_eval_results(results_path, requests_path, dynamic_path)

    records = [model.to_dict() for model in raw_data]
    records.append(baseline_row)
    filter_models_flags(records)

    ranked = pd.DataFrame.from_records(records).sort_values(
        by=[AutoEvalColumn.average.name], ascending=False
    )
    rounded = ranked[cols].round(decimals=2)
    complete = rounded[has_no_nan_values(rounded, benchmark_cols)]
    return raw_data, complete

src/tools/collections.py CHANGED Viewed

@@ -17,65 +17,60 @@ intervals = {
 }
-def update_collections(df: DataFrame):
-    """This function updates the Open LLM Leaderboard model collection with the latest best models for
-    each size category and type.
-    """
-    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    cur_best_models = []
-    ix = 0
-    for type in ModelType:
-        if type.value.name == "":
             continue
-        for size in intervals:
-            # We filter the df to gather the relevant models
-            type_emoji = [t[0] for t in type.value.symbol]
-            filtered_df = df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-            numeric_interval = pd.IntervalIndex([intervals[size]])
-            mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-            filtered_df = filtered_df.loc[mask]
             best_models = list(
-                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name]
             )
-            print(type.value.symbol, size, best_models[:10])
-            # We add them one by one to the leaderboard
-            for model in best_models:
-                ix += 1
-                cur_len_collection = len(collection.items)
-                try:
-                    collection = add_collection_item(
-                        PATH_TO_COLLECTION,
-                        item_id=model,
-                        item_type="model",
-                        exists_ok=True,
-                        note=f"Best {type.to_str(' ')} model of around {size} on the leaderboard today!",
-                        token=H4_TOKEN,
-                    )
-                    if (
-                        len(collection.items) > cur_len_collection
-                    ):  # we added an item - we make sure its position is correct
-                        item_object_id = collection.items[-1].item_object_id
-                        update_collection_item(
-                            collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix
-                        )
-                        cur_len_collection = len(collection.items)
-                    cur_best_models.append(model)
-                    break
-                except HfHubHTTPError:
-                    continue
-    collection = get_collection(PATH_TO_COLLECTION, token=H4_TOKEN)
-    for item in collection.items:
-        if item.item_id not in cur_best_models:
-            try:
-                delete_collection_item(
-                    collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
-                )
-            except HfHubHTTPError:
-                continue

 }
def _filter_by_type_and_size(df, model_type, size_interval):
    """Return the rows of ``df`` matching a model type and a parameter-size interval.

    A row matches the type when its model-type-symbol column equals the first
    element of ``model_type.value.symbol``. It matches the size when its
    params value, converted to a number, is ``in`` ``size_interval``.
    """
    symbol_column = AutoEvalColumn.model_type_symbol.name
    wanted_symbol = model_type.value.symbol[0]
    same_type_df = df[df[symbol_column] == wanted_symbol]

    # Values that cannot be parsed as numbers are coerced to NaN.
    param_counts = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
    # The mask is computed over the full frame and applied to the type-filtered subset.
    in_interval = param_counts.apply(lambda count: count in size_interval)
    return same_type_df.loc[in_interval]
def _add_models_to_collection(collection, models, model_type, size, position=None):
    """Add the first candidate the Hub accepts to the collection.

    Candidates are tried in order; a candidate whose Hub call raises
    ``HfHubHTTPError`` is skipped and the next one is tried.

    Args:
        collection: Current state of the collection. Its item count is used
            to detect whether a call actually added a new item.
        models: Candidate model ids, best first.
        model_type: ``ModelType`` member, used in the item's note.
        size: Size label, used in the item's note.
        position: Position to give a newly added item. Defaults to the
            candidate's 1-based rank within ``models``.

    Returns:
        A tuple ``(collection, model)``: the most recent collection state and
        the id of the model that was kept, or ``None`` if every candidate failed.
    """
    cur_len_collection = len(collection.items)
    for ix, model in enumerate(models, start=1):
        try:
            collection = add_collection_item(
                PATH_TO_COLLECTION,
                item_id=model,
                item_type="model",
                exists_ok=True,
                note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
                token=H4_TOKEN,
            )
            # With exists_ok=True the model may already have been present, so
            # only a grown item list means a new item needs positioning.
            was_added = len(collection.items) > cur_len_collection
            cur_len_collection = len(collection.items)
            if was_added:
                item_object_id = collection.items[-1].item_object_id
                update_collection_item(
                    collection_slug=PATH_TO_COLLECTION,
                    item_object_id=item_object_id,
                    position=ix if position is None else position,
                    token=H4_TOKEN,
                )
            return collection, model  # only the top accepted model is kept
        except HfHubHTTPError:
            continue
    return collection, None


def update_collections(df: DataFrame):
    """Update the collection with the best model of each type and size group.

    For every named ``ModelType`` and every entry of ``intervals``, the ten
    highest-average models are taken as candidates and the first one the Hub
    accepts is kept. Afterwards every collection item that was not kept in
    this run is deleted.

    Args:
        df: Leaderboard dataframe containing the type-symbol, params, average
            and dummy (model id) columns.
    """
    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
    cur_best_models = []

    for model_type in ModelType:
        if not model_type.value.name:
            continue
        for size, interval in intervals.items():
            filtered_df = _filter_by_type_and_size(df, model_type, interval)
            best_models = list(
                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name][:10]
            )
            print(model_type.value.symbol, size, best_models)
            # Carry the refreshed collection forward so the next "was an item
            # added?" check compares against the current item count.
            collection, kept_model = _add_models_to_collection(
                collection,
                best_models,
                model_type,
                size,
                position=len(cur_best_models) + 1,
            )
            if kept_model is not None:
                cur_best_models.append(kept_model)

    # Cleanup: re-read the collection so items added above are visible, then
    # drop everything that is not one of this run's best models.
    keep = set(cur_best_models)
    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
    for item in collection.items:
        if item.item_id in keep:
            continue
        try:
            # Deletion is addressed by the item's object id, not the model repo id.
            delete_collection_item(
                collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
            )
        except HfHubHTTPError:
            continue