refactor columns
- app.py +1 -1
- src/about.py +8 -3
- src/display/utils.py +39 -10
- src/leaderboard/read_evals.py +21 -7
app.py
CHANGED
@@ -350,6 +350,6 @@ with demo:
     )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=
+scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
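For context, a minimal sketch of the restart loop this hunk configures. It assumes `restart_space` wraps `huggingface_hub`'s `HfApi.restart_space` and that the Space id and `HF_TOKEN` come from the environment; both names below are placeholders, not taken from this repo's code.

```python
import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

API = HfApi(token=os.environ.get("HF_TOKEN"))
REPO_ID = "your-org/your-leaderboard-space"  # placeholder, not the real Space id


def restart_space():
    # Restart the Space so the leaderboard reloads freshly pushed results.
    API.restart_space(repo_id=REPO_ID)


scheduler = BackgroundScheduler()
# Every 3600 s (one hour), matching the interval set in the hunk above.
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
```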
src/about.py
CHANGED
@@ -7,6 +7,8 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    higher_is_better: bool = True
+    scale_by_100: bool = True
 
 
 # Select your tasks here
@@ -16,11 +18,11 @@ class Tasks(Enum):
     task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
     task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
-    task3 = Task("gente_rephrasing", "acc,none", "GeNTE
+    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
     task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
     task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
-    task6 = Task("honest_ita", "acc,none", "HONEST")
-    task7 = Task("itacola", "mcc,none", "ItaCoLA")
+    task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
+    task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
     task8 = Task("news_sum", "bertscore,none", "News Sum")
     task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
     task10 = Task("truthfulqa_gen_ita", "rouge1_max,none", "TruthfulQA")
@@ -36,6 +38,9 @@ TITLE = """<h1 align="center" id="space-title">ItaEval leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 This leaderboard evaluates language models on <b>ItaEval</b>, a new unified benchmark for Italian.
+
+Some information:
+- compared to other leaderboards you may find online, we do not support automatic evaluation for new model submissions
 """
 
 ITA_EVAL_REPO = "https://github.com/g8a9/ita-eval"
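A minimal, self-contained check (illustrative, not part of the repo) of what this hunk changes in the `Task` dataclass: the two new flags default to `True`, so the existing positional definitions stay valid and only HONEST and ItaCoLA need explicit overrides.

```python
from dataclasses import dataclass


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    higher_is_better: bool = True
    scale_by_100: bool = True


# Positional construction keeps working; the new fields only need keywords
# where a task deviates from the defaults.
arc = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
honest = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
itacola = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)

assert arc.higher_is_better and arc.scale_by_100
assert not honest.higher_is_better and honest.scale_by_100
assert itacola.higher_is_better and not itacola.scale_by_100
```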
src/display/utils.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,29 +21,36 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
+
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-
+
+# Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["training_codebase", ColumnContent, ColumnContent("Training code", "str", False, False)])
+auto_eval_column_dict.append(["training_data", ColumnContent, ColumnContent("Training data", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -53,12 +61,13 @@ class EvalQueueColumn:  # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = ""
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,18 +92,37 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
+class DisclosedType(Enum):
+    D = ModelDetails(name="disclosed", symbol="🟢")
+    UD = ModelDetails(name="undisclosed", symbol="⭕")
+    Unknown = ModelDetails(name="", symbol="?")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.symbol}{separator}{self.value.name}"
+
+    @staticmethod
+    def from_str(type):
+        if "undisclosed" in type or "⭕" in type:
+            return DisclosedType.UD
+        if "disclosed" in type or "🟢" in type:
+            return DisclosedType.D
+        return DisclosedType.Unknown
+
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -104,14 +132,15 @@ class Precision(Enum):
             return Precision.bfloat16
         if precision in ["float32"]:
             return Precision.float32
-        #if precision in ["8bit"]:
+        # if precision in ["8bit"]:
         #    return Precision.qt_8bit
-        #if precision in ["4bit"]:
+        # if precision in ["4bit"]:
         #    return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
+        # if precision in ["GPTQ", "None"]:
         #    return Precision.qt_GPTQ
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
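A standalone copy of the new `DisclosedType` helper, kept here for illustration only, showing the round trip from a free-form config value to the emoji label rendered in the leaderboard table. The order of the substring checks matters: "undisclosed" has to be tested before "disclosed", since the latter is a substring of the former.

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class DisclosedType(Enum):
    D = ModelDetails(name="disclosed", symbol="🟢")
    UD = ModelDetails(name="undisclosed", symbol="⭕")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        # Render "symbol name", e.g. "🟢 disclosed".
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        # Check "undisclosed" first: "disclosed" would match it too.
        if "undisclosed" in type or "⭕" in type:
            return DisclosedType.UD
        if "disclosed" in type or "🟢" in type:
            return DisclosedType.D
        return DisclosedType.Unknown


print(DisclosedType.from_str("undisclosed").to_str())  # ⭕ undisclosed
print(DisclosedType.from_str("🟢").value.symbol)        # 🟢
```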
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, DisclosedType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -47,8 +47,10 @@ class EvalResult:
             "license": config.get("license", None),
             "num_params": config.get("params", None),
             "base_model": config.get("base_model", None),
-            "
-            "
+            "model_type": ModelType.from_str(config.get("model_type", "")),
+            "weight_type": WeightType[config.get("weight_type", "")],
+            "training_codebase": DisclosedType.from_str(config.get("training_codebase", "")),
+            "training_data": DisclosedType.from_str(config.get("training_data", "")),
         }
 
         # Precision
@@ -83,11 +85,21 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array(
+            accs = np.array(
+                [
+                    v.get(task.metric, None) if task.higher_is_better else 1 - v.get(task.metric, None)
+                    for k, v in data["results"].items()
+                    if task.benchmark == k
+                ]
+            )
+
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs)
+            if task.scale_by_100:
+                mean_acc *= 100.0
+
             results[task.benchmark] = mean_acc
 
         return self(
@@ -132,12 +144,14 @@ class EvalResult:
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.training_codebase.name: self.training_codebase.value.symbol,
+            AutoEvalColumn.training_data.name: self.training_data.value.symbol,
         }
 
         for task in Tasks:
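A toy walk-through of the new per-task aggregation, using a fabricated results payload in place of the real parsed JSON file: metrics whose task has `higher_is_better=False` are inverted as `1 - score`, and every score except ItaCoLA's MCC is scaled to a 0-100 range.

```python
from dataclasses import dataclass

import numpy as np


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    higher_is_better: bool = True
    scale_by_100: bool = True


# Fake payload standing in for one parsed results file.
data = {"results": {"honest_ita": {"acc,none": 0.10}, "itacola": {"mcc,none": 0.42}}}
tasks = [
    Task("honest_ita", "acc,none", "HONEST", higher_is_better=False),
    Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False),
]

results = {}
for task in tasks:
    # Invert the metric when lower is better, then average across matching entries.
    accs = np.array(
        [
            v.get(task.metric, None) if task.higher_is_better else 1 - v.get(task.metric, None)
            for k, v in data["results"].items()
            if task.benchmark == k
        ]
    )
    if accs.size == 0 or any(acc is None for acc in accs):
        continue
    mean_acc = np.mean(accs)
    if task.scale_by_100:
        mean_acc *= 100.0
    results[task.benchmark] = mean_acc

print(results)  # honest_ita ≈ 90.0 (inverted then scaled), itacola ≈ 0.42 (native MCC scale)
```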