Commit 69cf5b3: new version
Quentin Gallouédec committed
Parent: de52ad3

Files changed:
- app.py (+114 -101)
- src/envs.py (+0 -33)
- src/evaluation.py (+3 -2)
app.py CHANGED

@@ -1,32 +1,45 @@
-import glob
 import json
 import os
 import pprint
+import re
+import tempfile
 
 import gradio as gr
 import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import …
+from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
 
 from src.css_html_js import dark_mode_gradio_js
-from src.…
-from src.evaluation import ALL_ENV_IDS, evaluate
+from src.evaluation import evaluate
 from src.logging import configure_root_logger, setup_logger
 
 configure_root_logger()
 logger = setup_logger(__name__)
 
-…
-def model_hyperlink(link, model_id):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
-…
+API = HfApi(token=os.environ.get("TOKEN"))
+RESULTS_REPO = f"open-rl-leaderboard/results"
+
+pp = pprint.PrettyPrinter(width=80)
+
+ALL_ENV_IDS = {
+    "Atari": [
+        "BeamRiderNoFrameskip-v4",
+        "BreakoutNoFrameskip-v4",
+    ],
+    "Box2D": [
+        "LunarLander-v2",
+        "BipedalWalker-v3",
+    ],
+    "Classic control": [
+        "CartPole-v1",
+        "MountainCar-v0",
+    ],
+    "MuJoCo": [
+        "Hopper-v4",
+        "HalfCheetah-v4",
+    ],
+}
 
 
 def _backend_routine():

@@ -42,55 +55,51 @@ def _backend_routine():
     logger.info(f"Found {len(compatible_models)} compatible models")
 
     # Get the results
-…
-        local_dir=RESULTS_PATH,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+    pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")
+    filenames = API.list_repo_files(RESULTS_REPO, repo_type="dataset")
+    filenames = [filename for filename in filenames if pattern.match(filename)]
 
     evaluated_models = set()
-    for …
-…
+    for filename in filenames:
+        path = hf_hub_download(repo_id=RESULTS_REPO, filename=filename, repo_type="dataset")
+        with open(path) as fp:
+            report = json.load(fp)
+        evaluated_models.add((report["config"]["model_id"], report["config"]["model_sha"]))
 
     # Find the models that are not associated with any results
     pending_models = set(compatible_models) - evaluated_models
     logger.info(f"Found {len(pending_models)} pending models")
 
     # Run an evaluation on the models
-…
-        path_in_repo=…
-…
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        commits = []
+        for model_id, sha in pending_models:
+            logger.info(f"Running evaluation on {model_id}")
+            report = {"config": {"model_id": model_id, "model_sha": sha}}
+            try:
+                evaluations = evaluate(model_id, revision=sha)
+            except Exception as e:
+                logger.error(f"Error evaluating {model_id}: {e}")
+                evaluations = None
+
+            if evaluations is not None:
+                report["results"] = evaluations
+                report["status"] = "DONE"
+            else:
+                report["status"] = "FAILED"
+
+            # Update the results
+            dumped = json.dumps(report, indent=2)
+            path_in_repo = f"{model_id}/results_{sha}.json"
+            local_path = os.path.join(tmp_dir, path_in_repo)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            with open(local_path, "w") as f:
+                f.write(dumped)
+
+            commits.append(CommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=local_path))
+
+        API.create_commit(
+            repo_id=RESULTS_REPO, commit_message="Add evaluation results", operations=commits, repo_type="dataset"
         )
 

@@ -102,32 +111,27 @@ def backend_routine():
 
 
 def get_leaderboard_df():
-…
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-
-    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
-    data = []
+    # List all results files in results repo
+    pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")
+    filenames = API.list_repo_files(RESULTS_REPO, repo_type="dataset")
+    filenames = [filename for filename in filenames if pattern.match(filename)]
 
+    data = []
+    for filename in filenames:
+        path = hf_hub_download(repo_id=RESULTS_REPO, filename=filename, repo_type="dataset")
+        with open(path) as fp:
             report = json.load(fp)
-        model_id = report["config"]["model_id"]
-        row = {"…
-        if report["status"] == "DONE":
-…
+        user_id, model_id = report["config"]["model_id"].split("/")
+        row = {"user_id": user_id, "model_id": model_id}
+        if report["status"] == "DONE" and len(report["results"]) > 0:
+            env_ids = list(report["results"].keys())
+            assert len(env_ids) == 1, "Only one environment supported for the moment"
+            row["env_id"] = env_ids[0]
+            row["mean_episodic_return"] = np.mean(report["results"][env_ids[0]]["episodic_returns"])
         data.append(row)
 
-    # …
-    df = …
-    # Replace NaN values with empty strings
-    df = df.fillna("")
+    df = pd.DataFrame(data)  # create DataFrame
+    df = df.fillna("")  # replace NaN values with empty strings
     return df
 
 

@@ -144,39 +148,48 @@ The Open RL Leaderboard is a community-driven benchmark for reinforcement learni
 """
 
 
-def …
-…
-    column_names = ["Agent"] + [column_name]  # add model name column
-    df = data[column_names]
-…
+def select_env(df: pd.DataFrame, env_id: str):
+    df = df[df["env_id"] == env_id]
+
+    # Add the ranking
+    df = df.sort_values("mean_episodic_return", ascending=False)
+    df["ranking"] = np.arange(1, len(df) + 1)
+
+    # Add hyperlinks
+    for index, row in df.iterrows():
+        user_id = row["user_id"]
+        model_id = row["model_id"]
+        df.loc[index, "user_id"] = f"[{user_id}](https://huggingface.co/{user_id})"
+        df.loc[index, "model_id"] = f"[{model_id}](https://huggingface.co/{user_id}/{model_id})"
+
+    df = df[["ranking", "user_id", "model_id", "mean_episodic_return"]]
+    return df.values.tolist()
 
 
 with gr.Blocks(js=dark_mode_gradio_js) as demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Leaderboard"…
-…
+        with gr.TabItem("🏅 Leaderboard"):
+            df = get_leaderboard_df()
+            for env_domain, env_ids in ALL_ENV_IDS.items():
+                with gr.TabItem(env_domain):
+                    for env_id in env_ids:
+                        with gr.TabItem(env_id):
+                            with gr.Row(equal_height=False):
+                                gr.components.Dataframe(
+                                    value=select_env(df, env_id),
+                                    headers=["🏆 Ranking", "🧑 User", "🤖 Model id", "📊 Mean episodic return"],
+                                    datatype=["number", "markdown", "markdown", "number"],
+                                    row_count=(10, "fixed"),
+                                    scale=3,
+                                )
+                                gr.Video(
+                                    "https://huggingface.co/qgallouedec/MsPacmanNoFrameskip-v4-dqn_atari-seed1/resolve/main/replay.mp4",
+                                    autoplay=True,
+                                    scale=1,
+                                    min_width=50,
+                                )
 
     with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
         gr.Markdown(ABOUT_TEXT)

@@ -188,4 +201,4 @@ scheduler.start()
 
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
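Both _backend_routine and get_leaderboard_df rely on the same file-naming convention introduced by this commit: one JSON report per evaluated revision, stored at {model_id}/results_{model_sha}.json in the open-rl-leaderboard/results dataset. A minimal sketch of how the shared regex selects those paths (the example paths below are hypothetical, not files from the repo):

import re

# Same pattern as in app.py: user/model/results_<hex sha>.json
pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")

# Hypothetical repo paths, shaped like f"{model_id}/results_{sha}.json"
paths = [
    "qgallouedec/CartPole-v1-ppo-seed1/results_0123456789abcdef.json",  # matches
    "qgallouedec/CartPole-v1-ppo-seed1/README.md",                      # no match: wrong suffix
    "results_0123456789abcdef.json",                                    # no match: missing user/model prefix
]

for path in paths:
    print(path, "->", bool(pattern.match(path)))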
src/envs.py DELETED

@@ -1,33 +0,0 @@
-import os
-
-from huggingface_hub import HfApi
-
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
-
-OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
-
-# For evaluations
-DEVICE = "cpu"  # "cuda:0" if you add compute, for evaluations
-LIMIT = 20  # !!!! Should be None for actual evaluations!!!
-
-# For lighteval evaluations
-ACCELERATOR = "cpu"
-REGION = "us-east-1"
-VENDOR = "aws"
-# ----------------------------------
-
-REPO_ID = f"{OWNER}/leaderboard"
-RESULTS_REPO = f"{OWNER}/results"
-
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH = os.getenv("HF_HOME", ".")
-
-# Local caches
-RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-
-REFRESH_RATE = 1 * 60  # 1 min
-NUM_LINES_VISUALIZE = 300
-
-API = HfApi(token=TOKEN)
src/evaluation.py CHANGED

@@ -1,18 +1,19 @@
 import fnmatch
+import os
 from typing import Dict, SupportsFloat
 
 import gymnasium as gym
 import numpy as np
 import torch
 from gymnasium import wrappers
-from huggingface_hub import hf_hub_download
+from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.utils._errors import EntryNotFoundError
 
-from src.envs import API
 from src.logging import setup_logger
 
 logger = setup_logger(__name__)
 
+API = HfApi(token=os.environ.get("TOKEN"))
 
 ALL_ENV_IDS = [
     "CartPole-v1",