albertvillanova HF staff committed on
Commit
7379857
1 Parent(s): 5b4c5f8

Add Details tab

Browse files
Files changed (1) hide show
  1. app.py +131 -0
app.py CHANGED
@@ -19,6 +19,8 @@ EXCLUDED_KEYS = {
19
  # "alias",
20
  # }
21
 
 
 
22
 
23
  TASKS = {
24
  "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
@@ -29,6 +31,57 @@ TASKS = {
29
  "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
30
  "leaderboard_musr": ("MuSR", "leaderboard_musr"),
31
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  fs = HfFileSystem()
34
 
@@ -103,6 +156,49 @@ def update_tasks(task):
103
  )
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # if __name__ == "__main__":
107
  latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())
108
 
@@ -135,6 +231,18 @@ with gr.Blocks(fill_height=True) as demo:
135
  results = gr.HTML()
136
  with gr.Tab("Configs"):
137
  configs = gr.HTML()
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  load_btn_1.click(
140
  fn=load_result_dataframe,
@@ -166,6 +274,29 @@ with gr.Blocks(fill_height=True) as demo:
166
  fn=display_results,
167
  inputs=[dataframe_1, dataframe_2, task],
168
  outputs=[results, configs],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  )
170
 
171
  demo.launch()
 
19
  # "alias",
20
  # }
21
 
22
# Path template of the per-model details dataset; fill "model_name_sanitized"
# with the model id after replacing "/" by "__".
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
# Glob template of the per-subtask samples files; fill "subtask" with a
# subtask name. The trailing "*" is a glob wildcard, not a format field.
DETAILS_FILENAME = "samples_{subtask}_*.json"
24
 
25
  TASKS = {
26
  "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
 
31
  "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
32
  "leaderboard_musr": ("MuSR", "leaderboard_musr"),
33
  }
34
# Maps each task key to the list of subtask names whose per-sample details
# can be loaded. Tasks without a breakdown list themselves as sole subtask.
SUBTASKS = {
    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
    "leaderboard_bbh": [
        "leaderboard_bbh_boolean_expressions",
        "leaderboard_bbh_causal_judgement",
        "leaderboard_bbh_date_understanding",
        "leaderboard_bbh_disambiguation_qa",
        "leaderboard_bbh_formal_fallacies",
        "leaderboard_bbh_geometric_shapes",
        "leaderboard_bbh_hyperbaton",
        "leaderboard_bbh_logical_deduction_five_objects",
        "leaderboard_bbh_logical_deduction_seven_objects",
        "leaderboard_bbh_logical_deduction_three_objects",
        "leaderboard_bbh_movie_recommendation",
        "leaderboard_bbh_navigate",
        "leaderboard_bbh_object_counting",
        "leaderboard_bbh_penguins_in_a_table",
        "leaderboard_bbh_reasoning_about_colored_objects",
        "leaderboard_bbh_ruin_names",
        "leaderboard_bbh_salient_translation_error_detection",
        "leaderboard_bbh_snarks",
        "leaderboard_bbh_sports_understanding",
        "leaderboard_bbh_temporal_sequences",
        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
        "leaderboard_bbh_web_of_lies",
    ],
    "leaderboard_gpqa": [
        "leaderboard_gpqa_extended",
        "leaderboard_gpqa_diamond",
        "leaderboard_gpqa_main",
    ],
    "leaderboard_ifeval": ["leaderboard_ifeval"],
    # NOTE(review): an alternate key "leaderboard_math_hard" was previously
    # left commented out here; confirm "leaderboard_math" matches the key
    # used in TASKS.
    "leaderboard_math": [
        "leaderboard_math_algebra_hard",
        "leaderboard_math_counting_and_prob_hard",
        "leaderboard_math_geometry_hard",
        "leaderboard_math_intermediate_algebra_hard",
        "leaderboard_math_num_theory_hard",
        "leaderboard_math_prealgebra_hard",
        "leaderboard_math_precalculus_hard",
    ],
    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
    "leaderboard_musr": [
        "leaderboard_musr_murder_mysteries",
        "leaderboard_musr_object_placements",
        "leaderboard_musr_team_allocation",
    ],
}
84
+
85
 
86
  fs = HfFileSystem()
87
 
 
156
  )
157
 
158
 
159
def update_subtasks(task):
    """Build the Radio update listing the subtasks of the selected task.

    Args:
        task: Task key looked up in ``SUBTASKS``.

    Returns:
        A ``gr.Radio`` whose choices are the subtasks registered for
        ``task`` (no choices when the task is unknown) and whose current
        selection is cleared.
    """
    return gr.Radio(
        # Fall back to an empty list so an unknown task yields "no choices"
        # explicitly instead of passing None.
        choices=SUBTASKS.get(task, []),
        # Clear the selection: a subtask picked for the previous task is not
        # a valid choice for the new one.
        value=None,
        info="Evaluation subtasks to be displayed",
    )
164
+
165
+
166
def load_details_dataframe(model_id, subtask):
    """Load the per-sample details of one model on one subtask.

    The details dataset path is derived from ``DETAILS_DATASET_ID`` and the
    samples files are matched with ``DETAILS_FILENAME``; when several files
    match, the one whose path sorts last is read as JSON Lines.

    Args:
        model_id: Model identifier; "/" is replaced by "__" to build the
            dataset name.
        subtask: Subtask name substituted into the samples filename pattern.

    Returns:
        A DataFrame with one row per sample (nested JSON flattened by
        ``pd.json_normalize``) plus a "model_name" column holding
        ``model_id``, or ``None`` when an argument is empty or no samples
        file matches.
    """
    if not model_id or not subtask:
        return None
    model_name_sanitized = model_id.replace("/", "__")
    # Fill each template on its own rather than mixing an f-string with
    # ``str.format`` on the combined string.
    pattern = "/".join(
        (
            DETAILS_DATASET_ID.format(model_name_sanitized=model_name_sanitized),
            "**",
            DETAILS_FILENAME.format(subtask=subtask),
        )
    )
    paths = fs.glob(pattern)
    if not paths:
        return None
    # NOTE(review): presumably file names embed a sortable timestamp, so the
    # lexicographic maximum is the most recent run — confirm.
    latest_path = max(paths)
    with fs.open(latest_path, "r", encoding="utf-8") as f:
        # One JSON document per line; skip blank lines (e.g. a trailing
        # newline) instead of failing on them.
        records = [json.loads(line) for line in f if line.strip()]
    df = pd.json_normalize(records)
    df["model_name"] = model_id  # Keep model_name
    return df
185
+
186
+
187
def display_details(df_1, df_2, sample_idx):
    """Render one sample of each model side by side as an HTML table.

    Args:
        df_1: Details DataFrame of the first model, with a "model_name"
            column; may be ``None`` or empty when nothing was loaded.
        df_2: Details DataFrame of the second model, same expectations.
        sample_idx: Row position of the sample to display; coerced to
            ``int`` because it may arrive as a float.

    Returns:
        HTML of a table with one column per available model (named after
        its "model_name" value) and one row per sample field, or an empty
        string when the index is invalid or no model has that sample.
    """
    try:
        idx = int(sample_idx)
    except (TypeError, ValueError, OverflowError):
        return ""

    samples = []
    for details in (df_1, df_2):
        # Skip models whose details were not loaded instead of raising.
        if details is None or "model_name" not in details.columns:
            continue
        try:
            row = details.iloc[idx]
        except IndexError:
            # This model has no sample at the requested position.
            continue
        # Move model_name out of the values and use it as the column name.
        model_name = row["model_name"]
        samples.append(row.drop("model_name").rename(model_name))

    if not samples:
        return ""
    df = pd.concat(samples, axis="columns")
    return df.style.format(na_rep="").to_html()
200
+
201
+
202
  # if __name__ == "__main__":
203
  latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())
204
 
 
231
  results = gr.HTML()
232
  with gr.Tab("Configs"):
233
  configs = gr.HTML()
234
+ with gr.Tab("Details"):
235
+ subtask = gr.Radio(
236
+ SUBTASKS.get(task.value),
237
+ label="Subtasks",
238
+ info="Evaluation subtasks to be displayed (choose one of the Tasks above)",
239
+ )
240
+ sample_idx = gr.Number(value=0, label="Sample Index", info="Index of the sample to be displayed", minimum=0)
241
+ load_details_btn = gr.Button("Load Details")
242
+ details = gr.HTML()
243
+ details_dataframe_1 = gr.Dataframe(visible=False)
244
+ details_dataframe_2 = gr.Dataframe(visible=False)
245
+ details_dataframe = gr.DataFrame(visible=False)
246
 
247
  load_btn_1.click(
248
  fn=load_result_dataframe,
 
274
  fn=display_results,
275
  inputs=[dataframe_1, dataframe_2, task],
276
  outputs=[results, configs],
277
+ ).then(
278
+ fn=update_subtasks,
279
+ inputs=task,
280
+ outputs=subtask,
281
+ )
282
+
283
+ load_details_btn.click(
284
+ fn=load_details_dataframe,
285
+ inputs=[model_id_1, subtask],
286
+ outputs=details_dataframe_1,
287
+ ).then(
288
+ fn=load_details_dataframe,
289
+ inputs=[model_id_2, subtask],
290
+ outputs=details_dataframe_2,
291
+ ).then(
292
+ fn=display_details,
293
+ inputs=[details_dataframe_1, details_dataframe_2, sample_idx],
294
+ outputs=details,
295
+ )
296
+ sample_idx.change(
297
+ fn=display_details,
298
+ inputs=[details_dataframe_1, details_dataframe_2, sample_idx],
299
+ outputs=details,
300
  )
301
 
302
  demo.launch()