Alina Lozovskaia committed
Commit f86eaae
1 Parent(s): 87e47c2

Fixing WIP

src/display/utils.py CHANGED
@@ -1,9 +1,30 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 import json
+import logging
+from datetime import datetime
 import pandas as pd
 
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def parse_datetime(datetime_str):
+    formats = [
+        "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
+        "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
+        "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
+    ]
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(datetime_str, fmt)
+        except ValueError:
+            continue
+    # in rare cases set unix start time for files with incorrect time (legacy files)
+    logging.error(f"No valid date format found for: {datetime_str}")
+    return datetime(1970, 1, 1)
+
 def load_json_data(file_path):
     """Safely load JSON data from a file."""
     try:
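For reference, a minimal usage sketch of the parse_datetime helper added above. The timestamps are made up for illustration: each of the three accepted layouts parses to the same datetime, and anything unparseable falls back to the Unix epoch after logging an error.

from datetime import datetime
from src.display.utils import parse_datetime

# All three accepted layouts resolve to the same timestamp
assert parse_datetime("2024-03-01T12-30-05.123456") == datetime(2024, 3, 1, 12, 30, 5, 123456)
assert parse_datetime("2024-03-01T12:30:05.123456") == datetime(2024, 3, 1, 12, 30, 5, 123456)
assert parse_datetime("2024-03-01T12 30 05.123456") == datetime(2024, 3, 1, 12, 30, 5, 123456)

# Malformed legacy timestamps fall back to the Unix epoch (an error is logged)
assert parse_datetime("not-a-timestamp") == datetime(1970, 1, 1)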
src/leaderboard/filter_models.py CHANGED
@@ -1,8 +1,6 @@
-import logging
 from src.display.formatting import model_hyperlink
 from src.display.utils import AutoEvalColumn
 
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
@@ -141,7 +139,6 @@ def flag_models(leaderboard_data: list[dict]):
         else:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         if flag_key in FLAGGED_MODELS:
-            # logging.info(f"Flagged model: {flag_key}")  # Do we need to print out the list of flagged models?
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
             issue_link = model_hyperlink(
                 FLAGGED_MODELS[flag_key],
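A small aside on the context lines above: this hunk only drops an already commented-out log line, and the module-level logging setup now lives in src/display/utils.py. The discussion number for a flagged model is simply the last path segment of its forum link, as this illustration shows (the URL is made up):

link = "https://huggingface.co/spaces/some-org/some-space/discussions/123"  # hypothetical FLAGGED_MODELS value
issue_num = link.split("/")[-1]
assert issue_num == "123"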
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
 import json
 from pathlib import Path
-from datetime import datetime
 from json import JSONDecodeError
 import logging
 import math
@@ -14,7 +13,7 @@ from tqdm.contrib.logging import logging_redirect_tqdm
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -54,7 +53,14 @@ class EvalResult:
         org_and_model = config.get("model_name", "").split("/", 1)
         org = org_and_model[0] if len(org_and_model) > 1 else None
         model = org_and_model[-1]
-        result_key = "_".join(filter(None, [*org_and_model, precision.value.name]))
+        if len(org_and_model) == 1:
+            org = None
+            model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
+        else:
+            org = org_and_model[0]
+            model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
         results = cls.extract_results(data)  # Properly call the method to extract results
@@ -71,26 +77,39 @@ class EvalResult:
 
     @staticmethod
     def extract_results(data: Dict) -> Dict[str, float]:
+        """
+        Extracts and computes average scores from test result data for different benchmarks.
+        Skips entries based on specific conditions and handles NaN values appropriately.
+        Returns a dictionary with benchmarks as keys and their averaged scores as percentages.
+
+        Parameters:
+        - data (Dict): Input data with 'versions' and 'results'.
+
+        Returns:
+        - Dict[str, float]: A dictionary with benchmark names and their computed average scores.
+        """
         results = {}
         for task in Tasks:
-            task_value = task.value
+            task = task.value
 
-            if task_value.benchmark == "hendrycksTest":
-                if any(data.get("versions", {}).get(mmlu_k, 1) == 0 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]):
+            # We skip old mmlu entries
+            if task.benchmark == "hendrycksTest":
+                mmlu_keys = ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]
+                if any(data["versions"].get(mmlu_k, 1) == 0 for mmlu_k in mmlu_keys):
+                    continue
+
+            # Some truthfulQA values are NaNs
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
+                    results[task.benchmark] = 0.0
                     continue
 
-            if task_value.benchmark == "truthfulqa:mc":
-                task_key = "harness|truthfulqa:mc|0"
-                if task_key in data["results"]:
-                    task_metric_value = data["results"][task_key][task_value.metric]
-                    if math.isnan(float(task_metric_value)):
-                        results[task_value.benchmark] = 0.0
-                        continue
+            # We average all scores of a given metric (mostly for mmlu)
+            accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
+            if not accs or any(acc is None for acc in accs):
+                continue
 
-            accs = [float(v.get(task_value.metric, 0)) for k, v in data["results"].items() if task_value.benchmark in k and v.get(task_value.metric, None) is not None]
-            if accs:
-                mean_acc = np.mean(accs) * 100.0
-                results[task_value.benchmark] = mean_acc
+            results[task.benchmark] = np.mean(accs) * 100.0
 
         return results
 
@@ -192,23 +211,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def parse_datetime(datetime_str):
-    formats = [
-        "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
-        "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
-        "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
-    ]
-
-    for fmt in formats:
-        try:
-            return datetime.strptime(datetime_str, fmt)
-        except ValueError:
-            continue
-    # in rare cases set unix start time for files with incorrect time (legacy files)
-    logging.error(f"No valid date format found for: {datetime_str}")
-    return datetime(1970, 1, 1)
-
-
 def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     with open(dynamic_path) as f:
@@ -246,7 +248,8 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError as e:
-            logging.error(f"Error while checking model {k} dict, no key: {e}")  # not all eval values present
+            logging.error(f"Error while checking model {k} {v.date} json, no key: {e}")  # not all eval values present
            continue
 
-    return results
+    return results
+
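For clarity, a standalone sketch that mirrors the new result_key construction in EvalResult.init_from_json_file. The helper name and the example model/precision strings are hypothetical and used only for illustration:

# Hypothetical helper mirroring the diff's result_key logic (not part of the repo)
def build_result_key(model_name: str, precision_name: str) -> str:
    org_and_model = model_name.split("/", 1)
    if len(org_and_model) == 1:
        # No org prefix: key is "<model>_<precision>"
        return f"{org_and_model[0]}_{precision_name}"
    org, model = org_and_model
    # Org-qualified name: key is "<org>_<model>_<precision>"
    return f"{org}_{model}_{precision_name}"

assert build_result_key("some-org/some-model", "bfloat16") == "some-org_some-model_bfloat16"
assert build_result_key("some-model", "float16") == "some-model_float16"

The removed one-liner ("_".join(filter(None, [*org_and_model, precision.value.name]))) produced the same keys; the explicit branch trades brevity for readability, at the cost of reassigning the org and model values set two lines earlier.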