g8a9 committed
Commit 56451ad · 1 Parent(s): b901e2a

refactor columns

Files changed (4)
  1. app.py +1 -1
  2. src/about.py +8 -3
  3. src/display/utils.py +39 -10
  4. src/leaderboard/read_evals.py +21 -7
app.py CHANGED
@@ -350,6 +350,6 @@ with demo:
     )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
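
The only functional change in app.py is the auto-restart interval: restart_space is now scheduled every 3600 seconds (hourly) instead of every 1800. A minimal sketch of the scheduling pattern, assuming the apscheduler package; restart_space is stubbed here and is not the app's actual implementation:

# Minimal sketch of the restart-scheduling pattern (assumes `apscheduler` is installed;
# `restart_space` is a stand-in for the app's real restart logic).
from apscheduler.schedulers.background import BackgroundScheduler


def restart_space():
    # In the real app.py this triggers a restart of the Space via the Hub API.
    print("restarting space...")


scheduler = BackgroundScheduler()
# After this commit the job fires every 3600 seconds rather than every 1800.
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()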
src/about.py CHANGED
@@ -7,6 +7,8 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    higher_is_better: bool = True
+    scale_by_100: bool = True
 
 
 # Select your tasks here
@@ -16,11 +18,11 @@ class Tasks(Enum):
     task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
     task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
-    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Rephr")
+    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
     task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
     task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
-    task6 = Task("honest_ita", "acc,none", "HONEST")
-    task7 = Task("itacola", "mcc,none", "ItaCoLA")
+    task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
+    task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
     task8 = Task("news_sum", "bertscore,none", "News Sum")
     task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
     task10 = Task("truthfulqa_gen_ita", "rouge1_max,none", "TruthfulQA")
@@ -36,6 +38,9 @@ TITLE = """<h1 align="center" id="space-title">ItaEval leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 This leaderboard evaluates language models on <b>ItaEval</b>, a new unified benchmark for Italian.
+
+Some information:
+- compared to other leaderboards you may find online, we do not support automatic evaluation for new model submissions
 """
 
 ITA_EVAL_REPO = "https://github.com/g8a9/ita-eval"
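
The two new Task fields are consumed by the score aggregation in src/leaderboard/read_evals.py (diff below): higher_is_better=False marks metrics where lower is better (HONEST), and scale_by_100=False keeps non-percentage metrics such as ItaCoLA's MCC on their native scale. A minimal sketch of the intended semantics; display_score is an illustrative helper, not a function from this repo:

# Sketch of how the new Task flags are meant to be interpreted downstream.
# `display_score` is illustrative; the merged logic lives in read_evals.py.
from dataclasses import dataclass


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    higher_is_better: bool = True
    scale_by_100: bool = True


honest = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
itacola = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)


def display_score(task: Task, raw: float) -> float:
    # Invert "lower is better" metrics so higher is always better on the board,
    # then optionally rescale to a 0-100 range.
    score = raw if task.higher_is_better else 1 - raw
    return score * 100.0 if task.scale_by_100 else score


print(display_score(honest, 0.12))   # 88.0 (HONEST score inverted, then scaled)
print(display_score(itacola, 0.45))  # 0.45 (MCC left on its native scale)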
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,29 +21,36 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
+
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
+
+# Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["training_codebase", ColumnContent, ColumnContent("Training code", "str", False, False)])
+auto_eval_column_dict.append(["training_data", ColumnContent, ColumnContent("Training data", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -53,12 +61,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,18 +92,37 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
+class DisclosedType(Enum):
+    D = ModelDetails(name="disclosed", symbol="🟢")
+    UD = ModelDetails(name="undisclosed", symbol="⭕")
+    Unknown = ModelDetails(name="", symbol="?")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.symbol}{separator}{self.value.name}"
+
+    @staticmethod
+    def from_str(type):
+        if "disclosed" in type or "🟢" in type:
+            return DisclosedType.D
+        if "undisclosed" in type or "⭕" in type:
+            return DisclosedType.UD
+        return DisclosedType.Unknown
+
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
    Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -104,14 +132,15 @@ class Precision(Enum):
             return Precision.bfloat16
         if precision in ["float32"]:
             return Precision.float32
-        #if precision in ["8bit"]:
+        # if precision in ["8bit"]:
         # return Precision.qt_8bit
-        #if precision in ["4bit"]:
+        # if precision in ["4bit"]:
        # return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
+        # if precision in ["GPTQ", "None"]:
        # return Precision.qt_GPTQ
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
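
One caveat on the new DisclosedType.from_str: because "disclosed" is a substring of "undisclosed", the check order in the commit maps an "undisclosed" string to DisclosedType.D. Below is a sketch of the same enum with the branches swapped so the round trip behaves as expected; it is a suggested variant, not the code that was merged:

# Sketch of DisclosedType with the from_str branches reordered so that
# "undisclosed" is tested before its substring "disclosed".
# ModelDetails mirrors the dataclass defined in src/display/utils.py.
from dataclasses import dataclass
from enum import Enum


@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class DisclosedType(Enum):
    D = ModelDetails(name="disclosed", symbol="🟢")
    UD = ModelDetails(name="undisclosed", symbol="⭕")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        # "undisclosed" first, so it is not shadowed by the "disclosed" substring check.
        if "undisclosed" in type or "⭕" in type:
            return DisclosedType.UD
        if "disclosed" in type or "🟢" in type:
            return DisclosedType.D
        return DisclosedType.Unknown


print(DisclosedType.UD.to_str())                                  # ⭕ undisclosed
print(DisclosedType.from_str("undisclosed") is DisclosedType.UD)  # True with this ordering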
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, DisclosedType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -47,8 +47,10 @@ class EvalResult:
             "license": config.get("license", None),
             "num_params": config.get("params", None),
             "base_model": config.get("base_model", None),
-            "training_codebase": config.get("training_codebase", None),
-            "training_data": config.get("training_data", None),
+            "model_type": ModelType.from_str(config.get("model_type", "")),
+            "weight_type": WeightType[config.get("weight_type", "")],
+            "training_codebase": DisclosedType.from_str(config.get("training_codebase", "")),
+            "training_data": DisclosedType.from_str(config.get("training_data", "")),
         }
 
         # Precision
@@ -83,11 +85,21 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array(
+                [
+                    v.get(task.metric, None) if task.higher_is_better else 1 - v.get(task.metric, None)
+                    for k, v in data["results"].items()
+                    if task.benchmark == k
+                ]
+            )
+
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs) * 100.0
+            mean_acc = np.mean(accs)
+            if task.scale_by_100:
+                mean_acc *= 100.0
+
             results[task.benchmark] = mean_acc
 
         return self(
@@ -132,12 +144,14 @@ class EvalResult:
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.training_codebase.name: self.training_codebase.value.symbol,
+            AutoEvalColumn.training_data.name: self.training_data.value.symbol,
         }
 
         for task in Tasks:
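
Two small caveats on the read_evals.py changes: WeightType[config.get("weight_type", "")] is a lookup by enum member name, so a missing or unrecognized weight_type raises KeyError rather than falling back to a default; and 1 - v.get(task.metric, None) raises TypeError when the metric is absent for a lower-is-better task, since the None guard runs only after the array is built. A tolerant lookup could look like the sketch below; get_weight_type and the Original fallback are illustrative choices, not code from this commit:

# Illustrative only: a KeyError-tolerant version of the weight_type lookup.
# WeightType and ModelDetails mirror src/display/utils.py; `get_weight_type`
# and the Original default are assumptions made for this sketch.
from dataclasses import dataclass
from enum import Enum


@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class WeightType(Enum):
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")


def get_weight_type(config: dict) -> WeightType:
    # Fall back to Original instead of raising KeyError on "", None, or typos.
    try:
        return WeightType[config.get("weight_type") or "Original"]
    except KeyError:
        return WeightType.Original


print(get_weight_type({"weight_type": "Adapter"}).value.name)  # Adapter
print(get_weight_type({}).value.name)                          # Original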