Quentin Gallouédec committed
Commit 69cf5b3
1 Parent(s): de52ad3

new version

Files changed (3)
  1. app.py +114 -101
  2. src/envs.py +0 -33
  3. src/evaluation.py +3 -2
app.py CHANGED
@@ -1,32 +1,45 @@
-import glob
 import json
 import os
 import pprint
+import re
+import tempfile
 
 import gradio as gr
 import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
+from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
 
 from src.css_html_js import dark_mode_gradio_js
-from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
-from src.evaluation import ALL_ENV_IDS, evaluate
+from src.evaluation import evaluate
 from src.logging import configure_root_logger, setup_logger
 
 configure_root_logger()
 logger = setup_logger(__name__)
 
-pp = pprint.PrettyPrinter(width=80)
-
-
-def model_hyperlink(link, model_id):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
+API = HfApi(token=os.environ.get("TOKEN"))
+RESULTS_REPO = f"open-rl-leaderboard/results"
 
+pp = pprint.PrettyPrinter(width=80)
 
-def make_clickable_model(model_id):
-    link = f"https://huggingface.co/{model_id}"
-    return model_hyperlink(link, model_id)
+ALL_ENV_IDS = {
+    "Atari": [
+        "BeamRiderNoFrameskip-v4",
+        "BreakoutNoFrameskip-v4",
+    ],
+    "Box2D": [
+        "LunarLander-v2",
+        "BipedalWalker-v3",
+    ],
+    "Classic control": [
+        "CartPole-v1",
+        "MountainCar-v0",
+    ],
+    "MuJoCo": [
+        "Hopper-v4",
+        "HalfCheetah-v4",
+    ],
+}
 
 
 def _backend_routine():
@@ -42,55 +55,51 @@ def _backend_routine():
     logger.info(f"Found {len(compatible_models)} compatible models")
 
     # Get the results
-    snapshot_download(
-        repo_id=RESULTS_REPO,
-        revision="main",
-        local_dir=RESULTS_PATH,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+    pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")
+    filenames = API.list_repo_files(RESULTS_REPO, repo_type="dataset")
+    filenames = [filename for filename in filenames if pattern.match(filename)]
 
     evaluated_models = set()
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        evaluated_models.add((data["config"]["model_id"], data["config"]["model_sha"]))
+    for filename in filenames:
+        path = hf_hub_download(repo_id=RESULTS_REPO, filename=filename, repo_type="dataset")
+        with open(path) as fp:
+            report = json.load(fp)
+        evaluated_models.add((report["config"]["model_id"], report["config"]["model_sha"]))
 
     # Find the models that are not associated with any results
    pending_models = set(compatible_models) - evaluated_models
     logger.info(f"Found {len(pending_models)} pending models")
 
     # Run an evaluation on the models
-    for model_id, sha in pending_models:
-        logger.info(f"Running evaluation on {model_id}")
-        report = {"config": {"model_id": model_id, "model_sha": sha}}
-        try:
-            evaluations = evaluate(model_id, revision=sha)
-        except Exception as e:
-            logger.error(f"Error evaluating {model_id}: {e}")
-            evaluations = None
-
-        if evaluations is not None:
-            report["results"] = evaluations
-            report["status"] = "DONE"
-        else:
-            report["status"] = "FAILED"
-
-        # Update the results
-        dumped = json.dumps(report, indent=2)
-        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        with open(output_path, "w") as f:
-            f.write(dumped)
-
-        # Upload the results to the results repo
-        API.upload_file(
-            path_or_fileobj=output_path,
-            path_in_repo=f"{model_id}/results_{sha}.json",
-            repo_id=RESULTS_REPO,
-            repo_type="dataset",
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        commits = []
+        for model_id, sha in pending_models:
+            logger.info(f"Running evaluation on {model_id}")
+            report = {"config": {"model_id": model_id, "model_sha": sha}}
+            try:
+                evaluations = evaluate(model_id, revision=sha)
+            except Exception as e:
+                logger.error(f"Error evaluating {model_id}: {e}")
+                evaluations = None
+
+            if evaluations is not None:
+                report["results"] = evaluations
+                report["status"] = "DONE"
+            else:
+                report["status"] = "FAILED"
+
+            # Update the results
+            dumped = json.dumps(report, indent=2)
+            path_in_repo = f"{model_id}/results_{sha}.json"
+            local_path = os.path.join(tmp_dir, path_in_repo)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            with open(local_path, "w") as f:
+                f.write(dumped)
+
+            commits.append(CommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=local_path))
+
+        API.create_commit(
+            repo_id=RESULTS_REPO, commit_message="Add evaluation results", operations=commits, repo_type="dataset"
        )
 
 
@@ -102,32 +111,27 @@ def backend_routine():
 
 
 def get_leaderboard_df():
-    snapshot_download(
-        repo_id=RESULTS_REPO,
-        revision="main",
-        local_dir=RESULTS_PATH,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-
-    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
-    data = []
+    # List all results files in results repo
+    pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")
+    filenames = API.list_repo_files(RESULTS_REPO, repo_type="dataset")
+    filenames = [filename for filename in filenames if pattern.match(filename)]
 
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
+    data = []
+    for filename in filenames:
+        path = hf_hub_download(repo_id=RESULTS_REPO, filename=filename, repo_type="dataset")
+        with open(path) as fp:
            report = json.load(fp)
-            model_id = report["config"]["model_id"]
-            row = {"Agent": model_id, "Status": report["status"]}
-            if report["status"] == "DONE":
-                results = {env_id: np.mean(result["episodic_return"]) for env_id, result in report["results"].items()}
-                row.update(results)
+        user_id, model_id = report["config"]["model_id"].split("/")
+        row = {"user_id": user_id, "model_id": model_id}
+        if report["status"] == "DONE" and len(report["results"]) > 0:
+            env_ids = list(report["results"].keys())
+            assert len(env_ids) == 1, "Only one environment supported for the moment"
+            row["env_id"] = env_ids[0]
+            row["mean_episodic_return"] = np.mean(report["results"][env_ids[0]]["episodic_returns"])
        data.append(row)
 
-    # Create DataFrame
-    df = pd.DataFrame(data)
-    # Replace NaN values with empty strings
-    df = df.fillna("")
+    df = pd.DataFrame(data)  # create DataFrame
+    df = df.fillna("")  # replace NaN values with empty strings
    return df
 
 
@@ -144,39 +148,48 @@ The Open RL Leaderboard is a community-driven benchmark for reinforcement learni
 """
 
 
-def select_column(column_name: str, data: pd.DataFrame):
-    # column_names = [col for col in column_names if col in data.columns]
-    column_names = ["Agent"] + [column_name]  # add model name column
-    df = data[column_names]
+def select_env(df: pd.DataFrame, env_id: str):
+    df = df[df["env_id"] == env_id]
 
-    def check_row(row):
-        return not (row.drop("Agent") == "").all()
+    # Add the ranking
+    df = df.sort_values("mean_episodic_return", ascending=False)
+    df["ranking"] = np.arange(1, len(df) + 1)
 
-    mask = df.apply(check_row, axis=1)
-    df = df[mask]
-    df = df.sort_values(by=column_name, ascending=False)
-    return df
+    # Add hyperlinks
+    for index, row in df.iterrows():
+        user_id = row["user_id"]
+        model_id = row["model_id"]
+        df.loc[index, "user_id"] = f"[{user_id}](https://huggingface.co/{user_id})"
+        df.loc[index, "model_id"] = f"[{model_id}](https://huggingface.co/{user_id}/{model_id})"
+
+    df = df[["ranking", "user_id", "model_id", "mean_episodic_return"]]
+    return df.values.tolist()
 
 
 with gr.Blocks(js=dark_mode_gradio_js) as demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
-            hidden_df = gr.components.Dataframe(get_leaderboard_df, visible=False, every=5 * 60)  # hidden dataframe
-
-            env_selector = gr.components.Dropdown(
-                label="Environments",
-                choices=ALL_ENV_IDS,
-                value=ALL_ENV_IDS[0],
-                # interactive=True,
-            )
-            leaderboard = gr.components.Dataframe(select_column(ALL_ENV_IDS[0], get_leaderboard_df()))
-
-            # Events
-            env_selector.change(select_column, [env_selector, hidden_df], leaderboard)
-            # Update hidden dataframe
-            # hidden_df.change(get_leaderboard_df, [], hidden_df, every=10)
+        with gr.TabItem("🏅 Leaderboard"):
+            df = get_leaderboard_df()
+            for env_domain, env_ids in ALL_ENV_IDS.items():
+                with gr.TabItem(env_domain):
+                    for env_id in env_ids:
+                        with gr.TabItem(env_id):
+                            with gr.Row(equal_height=False):
+                                gr.components.Dataframe(
+                                    value=select_env(df, env_id),
+                                    headers=["🏆 Ranking", "🧑 User", "🤖 Model id", "📊 Mean episodic return"],
+                                    datatype=["number", "markdown", "markdown", "number"],
+                                    row_count=(10, "fixed"),
+                                    scale=3,
+                                )
+                                gr.Video(
+                                    "https://huggingface.co/qgallouedec/MsPacmanNoFrameskip-v4-dqn_atari-seed1/resolve/main/replay.mp4",
+                                    autoplay=True,
+                                    scale=1,
+                                    min_width=50,
+                                )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(ABOUT_TEXT)
@@ -188,4 +201,4 @@ scheduler.start()
 
 
 if __name__ == "__main__":
-    demo.queue().launch()  # server_name="0.0.0.0", show_error=True, server_port=7860)
+    demo.queue().launch()
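
For context on the new results-discovery logic in app.py, here is a minimal sketch (not part of the commit) of how the regex used in _backend_routine and get_leaderboard_df filters the output of list_repo_files: only paths shaped like <user>/<model>/results_<sha>.json survive. The sample filenames below are hypothetical.

import re

# Same pattern as in app.py: "<user>/<model>/results_<lowercase-hex sha>.json"
pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")

# Hypothetical listing, standing in for API.list_repo_files(RESULTS_REPO, repo_type="dataset")
filenames = [
    "some-user/some-model/results_0123abcd.json",  # kept: matches the results pattern
    "some-user/some-model/README.md",  # dropped: not a results file
    ".gitattributes",  # dropped: not under a <user>/<model>/ folder
]
results_files = [name for name in filenames if pattern.match(name)]
print(results_files)  # ['some-user/some-model/results_0123abcd.json']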
src/envs.py DELETED
@@ -1,33 +0,0 @@
-import os
-
-from huggingface_hub import HfApi
-
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
-
-OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
-
-# For evaluations
-DEVICE = "cpu"  # "cuda:0" if you add compute, for evaluations
-LIMIT = 20  # !!!! Should be None for actual evaluations!!!
-
-# For lighteval evaluations
-ACCELERATOR = "cpu"
-REGION = "us-east-1"
-VENDOR = "aws"
-# ----------------------------------
-
-REPO_ID = f"{OWNER}/leaderboard"
-RESULTS_REPO = f"{OWNER}/results"
-
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH = os.getenv("HF_HOME", ".")
-
-# Local caches
-RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-
-REFRESH_RATE = 1 * 60  # 1 min
-NUM_LINES_VISUALIZE = 300
-
-API = HfApi(token=TOKEN)
src/evaluation.py CHANGED
@@ -1,18 +1,19 @@
 import fnmatch
+import os
 from typing import Dict, SupportsFloat
 
 import gymnasium as gym
 import numpy as np
 import torch
 from gymnasium import wrappers
-from huggingface_hub import hf_hub_download
+from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.utils._errors import EntryNotFoundError
 
-from src.envs import API
 from src.logging import setup_logger
 
 logger = setup_logger(__name__)
 
+API = HfApi(token=os.environ.get("TOKEN"))
 
 ALL_ENV_IDS = [
     "CartPole-v1",