luanagbmartins committed
Commit 3a55cb3 • 1 Parent(s): b32e3ed

filter by benchmark

Files changed (5)
  1. app.py +159 -101
  2. src/about.py +45 -3
  3. src/display/utils.py +11 -5
  4. src/leaderboard/read_evals.py +29 -27
  5. src/populate.py +0 -4
app.py CHANGED
@@ -11,7 +11,9 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    Tasks,
 )
+
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
@@ -59,8 +61,12 @@ try:
 except Exception:
     restart_space()
 
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(
+    EVAL_RESULTS_PATH,
+    EVAL_REQUESTS_PATH,
+    COLS,
+    BENCHMARK_COLS,
+)
 
 (
     finished_eval_queue_df,
@@ -85,124 +91,176 @@ def init_leaderboard(dataframe):
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
         ],
-        bool_checkboxgroup_label="Hide models",
+        # bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
 
+task_map = {getattr(Tasks, t).value.col_name: getattr(Tasks, t).name for t in dir(Tasks) if not t.startswith("_")}
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Position Bias Analyzer", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("[ENG] Position Bias Analyzer", elem_id="llm-benchmark-tab-table", id=0):
+            for filter in BENCHMARK_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = True
+
+            FILTERED_COLS = [c for c in BENCHMARK_COLS if not c.startswith("[ENG-P]")]
+            for filter in FILTERED_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = False
+
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("🏅 LLMBar Natural", elem_id="llm-benchmark-tab-table", id=4):
+        with gr.TabItem("[PT] Position Bias Analyzer", elem_id="llm-benchmark-tab-table", id=1):
+            for filter in BENCHMARK_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = True
+
+            FILTERED_COLS = [c for c in BENCHMARK_COLS if not c.startswith("[PT-P]")]
+            for filter in FILTERED_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = False
+
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("🏅 LLMBar Adversarial (Manual)", elem_id="llm-benchmark-tab-table", id=5):
+        with gr.TabItem("[ENG] LLMBar Natural", elem_id="llm-benchmark-tab-table", id=2):
+            for filter in BENCHMARK_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = True
+
+            FILTERED_COLS = [c for c in BENCHMARK_COLS if not c.startswith("[ENG-N]")]
+            for filter in FILTERED_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = False
+
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("[PT] LLMBar Natural", elem_id="llm-benchmark-tab-table", id=3):
+            for filter in BENCHMARK_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = True
+
+            FILTERED_COLS = [c for c in BENCHMARK_COLS if not c.startswith("[PT-N]")]
+            for filter in FILTERED_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = False
+
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("[ENG] LLMBar Adversarial (Manual)", elem_id="llm-benchmark-tab-table", id=4):
+            for filter in BENCHMARK_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = True
+
+            FILTERED_COLS = [c for c in BENCHMARK_COLS if not c.startswith("[ENG-A]")]
+            for filter in FILTERED_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = False
+
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("[PT] LLMBar Adversarial (Manual)", elem_id="llm-benchmark-tab-table", id=5):
+            for filter in BENCHMARK_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = True
+
+            FILTERED_COLS = [c for c in BENCHMARK_COLS if not c.startswith("[PT-A]")]
+            for filter in FILTERED_COLS:
+                getattr(AutoEvalColumn, task_map[filter]).displayed_by_default = False
+
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )

+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )

+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
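The per-tab blocks added above drive column visibility purely off the display-name prefix ([ENG-P], [PT-N], ...): every benchmark column is first switched on, then everything that does not carry the tab's prefix is switched off before `init_leaderboard` builds the table. A minimal, self-contained sketch of that selection idea (the toy column list and the helper name `cols_for_prefix` are illustrative, not part of the commit):

```python
# Sketch only: pick the benchmark columns a tab should show, by display-name prefix.
BENCHMARK_COLS = [
    "[ENG-P] Extraction Rate",
    "[ENG-N] Vanilla Acc",
    "[PT-P] Extraction Rate",
]


def cols_for_prefix(prefix: str, cols: list[str]) -> list[str]:
    """Return only the columns whose display name starts with `prefix`."""
    return [c for c in cols if c.startswith(prefix)]


print(cols_for_prefix("[ENG-P]", BENCHMARK_COLS))  # ['[ENG-P] Extraction Rate']
```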
src/about.py CHANGED
@@ -13,8 +13,51 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("eng_natural_vanilla_acc", "acc", "[ENG-N] Vanilla Acc")
+    task1 = Task("eng_natural_vanilla_pa", "pa", "[ENG-N] Vanilla PA")
+    task2 = Task("eng_natural_metrics_reference_acc", "acc", "[ENG-N] Metrics Reference Acc")
+    task3 = Task("eng_natural_metrics_reference_pa", "pa", "[ENG-N] Metrics Reference PA")
+    task4 = Task("eng_natural_swap_acc", "acc", "[ENG-N] Swap Acc")
+    task5 = Task("eng_natural_swap_pa", "pa", "[ENG-N] Swap PA")
+    task6 = Task("eng_natural_swap_cot_acc", "acc", "[ENG-N] Swap COT Acc")
+    task7 = Task("eng_natural_swap_cot_pa", "pa", "[ENG-N] Swap COT PA")
+
+    task8 = Task("eng_adversarial_manual_vanilla_acc", "acc", "[ENG-A] Vanilla Acc")
+    task9 = Task("eng_adversarial_manual_vanilla_pa", "pa", "[ENG-A] Vanilla PA")
+    task10 = Task("eng_adversarial_manual_metrics_reference_acc", "acc", "[ENG-A] Metrics Reference Acc")
+    task11 = Task("eng_adversarial_manual_metrics_reference_pa", "pa", "[ENG-A] Metrics Reference PA")
+    task12 = Task("eng_adversarial_manual_swap_acc", "acc", "[ENG-A] Swap Acc")
+    task13 = Task("eng_adversarial_manual_swap_pa", "pa", "[ENG-A] Swap PA")
+    task14 = Task("eng_adversarial_manual_swap_cot_acc", "acc", "[ENG-A] Swap COT Acc")
+    task15 = Task("eng_adversarial_manual_swap_cot_pa", "pa", "[ENG-A] Swap COT PA")
+
+    task16 = Task("pt_natural_vanilla_acc", "acc", "[PT-N] Vanilla Acc")
+    task17 = Task("pt_natural_vanilla_pa", "pa", "[PT-N] Vanilla PA")
+    task18 = Task("pt_natural_metrics_reference_acc", "acc", "[PT-N] Metrics Reference Acc")
+    task19 = Task("pt_natural_metrics_reference_pa", "pa", "[PT-N] Metrics Reference PA")
+    task20 = Task("pt_natural_swap_acc", "acc", "[PT-N] Swap Acc")
+    task21 = Task("pt_natural_swap_pa", "pa", "[PT-N] Swap PA")
+    task22 = Task("pt_natural_swap_cot_acc", "acc", "[PT-N] Swap COT Acc")
+    task23 = Task("pt_natural_swap_cot_pa", "pa", "[PT-N] Swap COT PA")
+
+    task24 = Task("pt_adversarial_manual_vanilla_acc", "acc", "[PT-A] Vanilla Acc")
+    task25 = Task("pt_adversarial_manual_vanilla_pa", "pa", "[PT-A] Vanilla PA")
+    task26 = Task("pt_adversarial_manual_metrics_reference_acc", "acc", "[PT-A] Metrics Reference Acc")
+    task27 = Task("pt_adversarial_manual_metrics_reference_pa", "pa", "[PT-A] Metrics Reference PA")
+    task28 = Task("pt_adversarial_manual_swap_acc", "acc", "[PT-A] Swap Acc")
+    task29 = Task("pt_adversarial_manual_swap_pa", "pa", "[PT-A] Swap PA")
+    task30 = Task("pt_adversarial_manual_swap_cot_acc", "acc", "[PT-A] Swap COT Acc")
+    task31 = Task("pt_adversarial_manual_swap_cot_pa", "pa", "[PT-A] Swap COT PA")
+
+    task32 = Task("eng_pba_extraction_avg", "avg", "[ENG-P] Extraction Rate")
+    task33 = Task("eng_pba_positional_avg", "avg", "[ENG-P] Positional Consistency Avg")
+    task34 = Task("eng_pba_positional_std", "std", "[ENG-P] Positional Consistency Std")
+    task35 = Task("eng_pba_preference_avg", "avg", "[ENG-P] Preference Score")
+
+    task36 = Task("pt_pba_extraction_avg", "avg", "[PT-P] Extraction Rate")
+    task37 = Task("pt_pba_positional_avg", "avg", "[PT-P] Positional Consistency Avg")
+    task38 = Task("pt_pba_positional_std", "std", "[PT-P] Positional Consistency Std")
+    task39 = Task("pt_pba_preference_avg", "avg", "[PT-P] Preference Score")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
@@ -26,7 +69,6 @@ TITLE = """<h1 align="center" id="space-title">LLM as Judge Eval</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
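Each new Tasks entry follows a `<language>_<subset>_<prompt strategy>_<metric>` key convention, and the bracketed prefix in the display name ([ENG-N], [PT-A], ...) is what app.py filters on. A small sketch of how one entry is interpreted, assuming the template's three-field Task dataclass (the example score is made up):

```python
from dataclasses import dataclass


@dataclass
class Task:
    benchmark: str  # task_key in the results JSON
    metric: str     # metric_key inside that task's entry
    col_name: str   # display name; its "[ENG-N]"-style prefix drives the tab filter


task0 = Task("eng_natural_vanilla_acc", "acc", "[ENG-N] Vanilla Acc")

# A results file would then be expected to look roughly like this:
data = {"results": {"eng_natural_vanilla_acc": {"acc": 0.87}}}
print(data["results"][task0.benchmark][task0.metric])  # 0.87
```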
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,15 +21,16 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", False)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -43,6 +45,7 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -53,12 +56,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +87,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,6 +106,7 @@ class Precision(Enum):
         return Precision.bfloat16
     return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -107,4 +114,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/leaderboard/read_evals.py CHANGED
@@ -14,22 +14,22 @@ from src.submission.check_validity import is_model_on_hub
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
@@ -70,14 +70,18 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
+            # # We average all scores of a given metric (not all metrics are present in all files)
+            # accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            # if accs.size == 0 or any([acc is None for acc in accs]):
+            #     continue
 
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
+            if [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark.startswith(k)][0]:
 
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+                results[str(task.benchmark)] = [
+                    v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark.startswith(k)
+                ][0] * 100.0
+            else:
+                results[str(task.benchmark)] = 0
 
         return self(
             eval_name=result_key,
@@ -85,10 +89,10 @@
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
@@ -105,7 +109,9 @@
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -139,17 +145,13 @@ def get_request_file_for_model(requests_path, model_name, precision):
         f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)
-
     # Select correct request file (precision)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
@@ -174,6 +176,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
+
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
@@ -188,9 +191,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
-
     return results
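init_from_json_file no longer averages a metric across matching entries; it now takes the first results entry whose key is a prefix of the task's benchmark name, scales it to a percentage, and falls back to 0 when the value is falsy. An illustrative, stand-alone sketch of that lookup (toy data; unlike the commit, it also guards against an empty match list rather than indexing [0] directly):

```python
# Toy results payload in the shape the leaderboard reads (values are made up).
data = {"results": {"eng_natural_vanilla_acc": {"acc": 0.87}}}
benchmark, metric = "eng_natural_vanilla_acc", "acc"

matches = [v.get(metric) for k, v in data["results"].items() if benchmark.startswith(k)]
score = matches[0] * 100.0 if matches and matches[0] is not None else 0
print(score)  # 87.0
```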
src/populate.py CHANGED
@@ -12,13 +12,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
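Dropping the `has_no_nan_values` filter means models that are missing some benchmark columns now stay in the leaderboard dataframe instead of being removed. A toy illustration of the effect (made-up rows):

```python
import pandas as pd

# Made-up rows: model-b is missing one benchmark score.
df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "[ENG-N] Vanilla Acc": [87.0, None],
    }
)

# Previously df = df[has_no_nan_values(df, benchmark_cols)] would have dropped model-b;
# without that filter both rows remain.
print(len(df))  # 2
```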