yzabc007 committed
Commit ee84fd2 • 1 Parent(s): efb5f5d

Update space

Files changed (3)
  1. README.md +1 -1
  2. app.py +6 -1
  3. src/leaderboard/read_evals.py +72 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Decentralized Arena
+title: Decentralized Arena Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
app.py CHANGED
@@ -122,7 +122,12 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Overall", elem_id="llm-benchmark-tab-table", id=0):
+
+        with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = overall_leaderboard(LEADERBOARD_DF)
+
+
+        with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = overall_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=1):
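For reference, a minimal runnable sketch of the tab layout this hunk produces. This is not the Space's actual app.py: INTRODUCTION_TEXT, LEADERBOARD_DF, and overall_leaderboard stand in for definitions that live elsewhere in app.py, and distinct TabItem ids are used here even though the committed hunk assigns id=0 to both new tabs.

```python
# Illustrative sketch only -- stand-ins for the Space's real helpers/config.
import gradio as gr
import pandas as pd

INTRODUCTION_TEXT = "Decentralized Arena leaderboard (placeholder intro)."
LEADERBOARD_DF = pd.DataFrame({"Model": ["model-a", "model-b"], "Score": [0.71, 0.64]})


def overall_leaderboard(df: pd.DataFrame) -> gr.Dataframe:
    # Stand-in for the Space's helper: render the dataframe as the leaderboard table.
    return gr.Dataframe(value=df, interactive=False)


with gr.Blocks() as demo:
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = overall_leaderboard(LEADERBOARD_DF)

        # Distinct ids used in this sketch; the committed hunk reuses id=0 here.
        with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
            leaderboard = overall_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
            leaderboard = overall_leaderboard(LEADERBOARD_DF)

if __name__ == "__main__":
    demo.launch()
```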
src/leaderboard/read_evals.py CHANGED
@@ -12,6 +12,50 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
+@dataclass
+class ModelResult:
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
+    """
+    eval_name: str
+    full_model: str
+
+    @classmethod
+    def init_from_jsonl_file(self, json_filepath):
+        try:
+            with open(json_filepath) as fp:
+                data = json.load(fp)
+        except:
+            data = eval(open(json_filepath).read()) # a list of dicts
+
+
+
+
+        return
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        data_dict = {
+            "eval_name": self.eval_name, # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+        }
+
+        for task in Tasks:
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+
+        return data_dict
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
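Aside: the init_from_jsonl_file added above is still a stub (a classmethod taking self, a bare except, eval() over the file contents, and no instance returned). Below is a hedged sketch of what it appears to be reaching for; the results field and the record keys ("eval_name", "full_model", "benchmark", "score") are illustrative assumptions, not part of the commit.

```python
# Hedged sketch only -- not the committed code. Loads a list of record dicts
# from either a JSON array file or a JSON-Lines file, without bare except/eval().
import json
from dataclasses import dataclass, field


@dataclass
class ModelResultSketch:
    eval_name: str
    full_model: str
    results: dict = field(default_factory=dict)  # assumed layout: benchmark name -> score

    @classmethod
    def init_from_jsonl_file(cls, json_filepath: str) -> "ModelResultSketch":
        """Load records from a .json array or a .jsonl (one JSON object per line) file."""
        with open(json_filepath) as fp:
            try:
                data = json.load(fp)  # whole file is a single JSON value
            except json.JSONDecodeError:
                fp.seek(0)
                data = [json.loads(line) for line in fp if line.strip()]
        if isinstance(data, dict):  # tolerate a single top-level object
            data = [data]

        first = data[0] if data else {}
        return cls(
            eval_name=first.get("eval_name", ""),
            full_model=first.get("full_model", ""),
            results={rec.get("benchmark", ""): rec.get("score") for rec in data},
        )
```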
@@ -195,3 +239,31 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     return results
 
+
+def get_raw_model_results(results_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = results_path
+
+    eval_results = {}
+
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict() # we test if the dict version is complete
+            results.append(v)
+        except KeyError: # not all eval values present
+            continue
+
+    return results
+
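Note that get_raw_model_results keeps the str annotation on results_path but immediately iterates it as a collection of file paths. A hedged usage sketch under that reading follows; the EVAL_RESULTS_PATH folder name and the glob pattern are assumptions for illustration, not taken from the Space's configuration.

```python
# Hedged usage sketch -- assumes results_path is really an iterable of result
# file paths, as the loop in the committed function suggests.
import glob
import os

from src.leaderboard.read_evals import get_raw_model_results

EVAL_RESULTS_PATH = "eval-results"  # assumed local results folder (illustrative)

# Gather every per-model result JSON under the results folder...
model_result_filepaths = sorted(
    glob.glob(os.path.join(EVAL_RESULTS_PATH, "**", "*.json"), recursive=True)
)

# ...and pass the list (not the folder path string) to the aggregator, which
# merges entries sharing an eval_name and skips any whose to_dict() raises
# KeyError because task results are missing.
raw_results = get_raw_model_results(model_result_filepaths)
leaderboard_rows = [result.to_dict() for result in raw_results]
```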