Quentin Gallouédec committed
Commit 69cf5b3
1 Parent(s): de52ad3

new version

Files changed (3)
  1. app.py +114 -101
  2. src/envs.py +0 -33
  3. src/evaluation.py +3 -2
app.py CHANGED
@@ -1,32 +1,45 @@
-import glob
 import json
 import os
 import pprint
+import re
+import tempfile
 
 import gradio as gr
 import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
+from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
 
 from src.css_html_js import dark_mode_gradio_js
-from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
-from src.evaluation import ALL_ENV_IDS, evaluate
+from src.evaluation import evaluate
 from src.logging import configure_root_logger, setup_logger
 
 configure_root_logger()
 logger = setup_logger(__name__)
 
-pp = pprint.PrettyPrinter(width=80)
-
-
-def model_hyperlink(link, model_id):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
+API = HfApi(token=os.environ.get("TOKEN"))
+RESULTS_REPO = f"open-rl-leaderboard/results"
 
+pp = pprint.PrettyPrinter(width=80)
 
-def make_clickable_model(model_id):
-    link = f"https://huggingface.co/{model_id}"
-    return model_hyperlink(link, model_id)
+ALL_ENV_IDS = {
+    "Atari": [
+        "BeamRiderNoFrameskip-v4",
+        "BreakoutNoFrameskip-v4",
+    ],
+    "Box2D": [
+        "LunarLander-v2",
+        "BipedalWalker-v3",
+    ],
+    "Classic control": [
+        "CartPole-v1",
+        "MountainCar-v0",
+    ],
+    "MuJoCo": [
+        "Hopper-v4",
+        "HalfCheetah-v4",
+    ],
+}
 
 
 def _backend_routine():
@@ -42,55 +55,51 @@ def _backend_routine():
     logger.info(f"Found {len(compatible_models)} compatible models")
 
     # Get the results
-    snapshot_download(
-        repo_id=RESULTS_REPO,
-        revision="main",
-        local_dir=RESULTS_PATH,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+    pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")
+    filenames = API.list_repo_files(RESULTS_REPO, repo_type="dataset")
+    filenames = [filename for filename in filenames if pattern.match(filename)]
 
     evaluated_models = set()
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        evaluated_models.add((data["config"]["model_id"], data["config"]["model_sha"]))
+    for filename in filenames:
+        path = hf_hub_download(repo_id=RESULTS_REPO, filename=filename, repo_type="dataset")
+        with open(path) as fp:
+            report = json.load(fp)
+        evaluated_models.add((report["config"]["model_id"], report["config"]["model_sha"]))
 
     # Find the models that are not associated with any results
    pending_models = set(compatible_models) - evaluated_models
     logger.info(f"Found {len(pending_models)} pending models")
 
     # Run an evaluation on the models
-    for model_id, sha in pending_models:
-        logger.info(f"Running evaluation on {model_id}")
-        report = {"config": {"model_id": model_id, "model_sha": sha}}
-        try:
-            evaluations = evaluate(model_id, revision=sha)
-        except Exception as e:
-            logger.error(f"Error evaluating {model_id}: {e}")
-            evaluations = None
-
-        if evaluations is not None:
-            report["results"] = evaluations
-            report["status"] = "DONE"
-        else:
-            report["status"] = "FAILED"
-
-        # Update the results
-        dumped = json.dumps(report, indent=2)
-        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        with open(output_path, "w") as f:
-            f.write(dumped)
-
-        # Upload the results to the results repo
-        API.upload_file(
-            path_or_fileobj=output_path,
-            path_in_repo=f"{model_id}/results_{sha}.json",
-            repo_id=RESULTS_REPO,
-            repo_type="dataset",
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        commits = []
+        for model_id, sha in pending_models:
+            logger.info(f"Running evaluation on {model_id}")
+            report = {"config": {"model_id": model_id, "model_sha": sha}}
+            try:
+                evaluations = evaluate(model_id, revision=sha)
+            except Exception as e:
+                logger.error(f"Error evaluating {model_id}: {e}")
+                evaluations = None
+
+            if evaluations is not None:
+                report["results"] = evaluations
+                report["status"] = "DONE"
+            else:
+                report["status"] = "FAILED"
+
+            # Update the results
+            dumped = json.dumps(report, indent=2)
+            path_in_repo = f"{model_id}/results_{sha}.json"
+            local_path = os.path.join(tmp_dir, path_in_repo)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            with open(local_path, "w") as f:
+                f.write(dumped)
+
+            commits.append(CommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=local_path))
+
+        API.create_commit(
+            repo_id=RESULTS_REPO, commit_message="Add evaluation results", operations=commits, repo_type="dataset"
        )
 
 
@@ -102,32 +111,27 @@ def backend_routine():
 
 
 def get_leaderboard_df():
-    snapshot_download(
-        repo_id=RESULTS_REPO,
-        revision="main",
-        local_dir=RESULTS_PATH,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-
-    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
-    data = []
+    # List all results files in results repo
+    pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")
+    filenames = API.list_repo_files(RESULTS_REPO, repo_type="dataset")
+    filenames = [filename for filename in filenames if pattern.match(filename)]
 
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
+    data = []
+    for filename in filenames:
+        path = hf_hub_download(repo_id=RESULTS_REPO, filename=filename, repo_type="dataset")
+        with open(path) as fp:
            report = json.load(fp)
-            model_id = report["config"]["model_id"]
-            row = {"Agent": model_id, "Status": report["status"]}
-            if report["status"] == "DONE":
-                results = {env_id: np.mean(result["episodic_return"]) for env_id, result in report["results"].items()}
-                row.update(results)
+        user_id, model_id = report["config"]["model_id"].split("/")
+        row = {"user_id": user_id, "model_id": model_id}
+        if report["status"] == "DONE" and len(report["results"]) > 0:
+            env_ids = list(report["results"].keys())
+            assert len(env_ids) == 1, "Only one environment supported for the moment"
+            row["env_id"] = env_ids[0]
+            row["mean_episodic_return"] = np.mean(report["results"][env_ids[0]]["episodic_returns"])
        data.append(row)
 
-    # Create DataFrame
-    df = pd.DataFrame(data)
-    # Replace NaN values with empty strings
-    df = df.fillna("")
+    df = pd.DataFrame(data)  # create DataFrame
+    df = df.fillna("")  # replace NaN values with empty strings
    return df
 
 
@@ -144,39 +148,48 @@ The Open RL Leaderboard is a community-driven benchmark for reinforcement learni
 """
 
 
-def select_column(column_name: str, data: pd.DataFrame):
-    # column_names = [col for col in column_names if col in data.columns]
-    column_names = ["Agent"] + [column_name]  # add model name column
-    df = data[column_names]
+def select_env(df: pd.DataFrame, env_id: str):
+    df = df[df["env_id"] == env_id]
 
-    def check_row(row):
-        return not (row.drop("Agent") == "").all()
+    # Add the ranking
+    df = df.sort_values("mean_episodic_return", ascending=False)
+    df["ranking"] = np.arange(1, len(df) + 1)
 
-    mask = df.apply(check_row, axis=1)
-    df = df[mask]
-    df = df.sort_values(by=column_name, ascending=False)
-    return df
+    # Add hyperlinks
+    for index, row in df.iterrows():
+        user_id = row["user_id"]
+        model_id = row["model_id"]
+        df.loc[index, "user_id"] = f"[{user_id}](https://huggingface.co/{user_id})"
+        df.loc[index, "model_id"] = f"[{model_id}](https://huggingface.co/{user_id}/{model_id})"
+
+    df = df[["ranking", "user_id", "model_id", "mean_episodic_return"]]
+    return df.values.tolist()
 
 
 with gr.Blocks(js=dark_mode_gradio_js) as demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
-            hidden_df = gr.components.Dataframe(get_leaderboard_df, visible=False, every=5 * 60)  # hidden dataframe
-
-            env_selector = gr.components.Dropdown(
-                label="Environments",
-                choices=ALL_ENV_IDS,
-                value=ALL_ENV_IDS[0],
-                # interactive=True,
-            )
-            leaderboard = gr.components.Dataframe(select_column(ALL_ENV_IDS[0], get_leaderboard_df()))
-
-            # Events
-            env_selector.change(select_column, [env_selector, hidden_df], leaderboard)
-            # Update hidden dataframe
-            # hidden_df.change(get_leaderboard_df, [], hidden_df, every=10)
+        with gr.TabItem("🏅 Leaderboard"):
+            df = get_leaderboard_df()
+            for env_domain, env_ids in ALL_ENV_IDS.items():
+                with gr.TabItem(env_domain):
+                    for env_id in env_ids:
+                        with gr.TabItem(env_id):
+                            with gr.Row(equal_height=False):
+                                gr.components.Dataframe(
+                                    value=select_env(df, env_id),
+                                    headers=["🏆 Ranking", "🧑 User", "🤖 Model id", "📊 Mean episodic return"],
+                                    datatype=["number", "markdown", "markdown", "number"],
+                                    row_count=(10, "fixed"),
+                                    scale=3,
+                                )
+                                gr.Video(
+                                    "https://huggingface.co/qgallouedec/MsPacmanNoFrameskip-v4-dqn_atari-seed1/resolve/main/replay.mp4",
+                                    autoplay=True,
+                                    scale=1,
+                                    min_width=50,
+                                )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(ABOUT_TEXT)
@@ -188,4 +201,4 @@ scheduler.start()
 
 
 if __name__ == "__main__":
-    demo.queue().launch()  # server_name="0.0.0.0", show_error=True, server_port=7860)
+    demo.queue().launch()
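
For context on the new results-discovery logic in app.py, here is a minimal sketch (not part of the commit) of how the regex used in _backend_routine and get_leaderboard_df filters the output of list_repo_files: only paths shaped like <user>/<model>/results_<sha>.json survive. The sample filenames below are hypothetical.

import re

# Same pattern as in app.py: "<user>/<model>/results_<lowercase-hex sha>.json"
pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")

# Hypothetical listing, standing in for API.list_repo_files(RESULTS_REPO, repo_type="dataset")
filenames = [
    "some-user/some-model/results_0123abcd.json",  # kept: matches the results pattern
    "some-user/some-model/README.md",  # dropped: not a results file
    ".gitattributes",  # dropped: not under a <user>/<model>/ folder
]
results_files = [name for name in filenames if pattern.match(name)]
print(results_files)  # ['some-user/some-model/results_0123abcd.json']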
src/envs.py DELETED
@@ -1,33 +0,0 @@
-import os
-
-from huggingface_hub import HfApi
-
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
-
-OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
-
-# For evaluations
-DEVICE = "cpu"  # "cuda:0" if you add compute, for evaluations
-LIMIT = 20  # !!!! Should be None for actual evaluations!!!
-
-# For lighteval evaluations
-ACCELERATOR = "cpu"
-REGION = "us-east-1"
-VENDOR = "aws"
-# ----------------------------------
-
-REPO_ID = f"{OWNER}/leaderboard"
-RESULTS_REPO = f"{OWNER}/results"
-
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH = os.getenv("HF_HOME", ".")
-
-# Local caches
-RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-
-REFRESH_RATE = 1 * 60  # 1 min
-NUM_LINES_VISUALIZE = 300
-
-API = HfApi(token=TOKEN)
src/evaluation.py CHANGED
@@ -1,18 +1,19 @@
 import fnmatch
+import os
 from typing import Dict, SupportsFloat
 
 import gymnasium as gym
 import numpy as np
 import torch
 from gymnasium import wrappers
-from huggingface_hub import hf_hub_download
+from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.utils._errors import EntryNotFoundError
 
-from src.envs import API
 from src.logging import setup_logger
 
 logger = setup_logger(__name__)
 
+API = HfApi(token=os.environ.get("TOKEN"))
 
 ALL_ENV_IDS = [
     "CartPole-v1",