Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Sean Cho
committed on
Commit
•
eb3c07b
1
Parent(s):
c1068ee
Add two new tasks
Browse files
- src/display/about.py +3 -1
- src/display/utils.py +2 -0
- src/leaderboard/read_evals.py +15 -1
- src/tools/plots.py +6 -1
src/display/about.py
CHANGED
@@ -33,11 +33,13 @@ Please provide information about the model through an issue! 🤩
|
|
33 |
|
34 |
📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
|
35 |
|
36 |
-
We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the
|
37 |
- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
|
38 |
- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
39 |
- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
40 |
- Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
|
|
|
|
41 |
- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
|
42 |
|
43 |
To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, and common sense. The final score is converted to the average score from each evaluation datasets.
|
|
|
33 |
|
34 |
📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
|
35 |
|
36 |
+
We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by HuggingFace OpenLLM. We have also added a new dataset prepared from scratch.
|
37 |
- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
|
38 |
- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
39 |
- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
40 |
- Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
41 |
+
- Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
42 |
+
- Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
43 |
- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
|
44 |
|
45 |
To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, and common sense. The final score is converted to the average score from each evaluation datasets.
|
src/display/utils.py
CHANGED
@@ -18,6 +18,8 @@ class Tasks(Enum):
|
|
18 |
hellaswag = Task("ko_hellaswag", "acc_norm", "Ko-HellaSwag")
|
19 |
mmlu = Task("ko_mmlu", "acc", "Ko-MMLU")
|
20 |
truthfulqa = Task("ko_truthfulqa_mc", "mc2", "Ko-TruthfulQA")
|
|
|
|
|
21 |
commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
|
22 |
|
23 |
# These classes are for user facing column names,
|
|
|
18 |
hellaswag = Task("ko_hellaswag", "acc_norm", "Ko-HellaSwag")
|
19 |
mmlu = Task("ko_mmlu", "acc", "Ko-MMLU")
|
20 |
truthfulqa = Task("ko_truthfulqa_mc", "mc2", "Ko-TruthfulQA")
|
21 |
+
winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
|
22 |
+
gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
|
23 |
commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
|
24 |
|
25 |
# These classes are for user facing column names,
|
src/leaderboard/read_evals.py
CHANGED
@@ -103,6 +103,11 @@ class EvalResult:
|
|
103 |
results[task.benchmark] = 0.0
|
104 |
continue
|
105 |
|
|
|
|
|
|
|
|
|
|
|
106 |
# We average all scores of a given metric (mostly for mmlu)
|
107 |
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
|
108 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
@@ -143,7 +148,16 @@ class EvalResult:
|
|
143 |
|
144 |
def to_dict(self):
|
145 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
data_dict = {
|
148 |
"eval_name": self.eval_name, # not a column, just a save name,
|
149 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
|
|
103 |
results[task.benchmark] = 0.0
|
104 |
continue
|
105 |
|
106 |
+
# Two new tasks have been added, we need to skip them for now
|
107 |
+
if task.benchmark == "ko_winogrande" or task.benchmark == "ko_gsm8k":
|
108 |
+
results[task.benchmark] = 0.0
|
109 |
+
continue
|
110 |
+
|
111 |
# We average all scores of a given metric (mostly for mmlu)
|
112 |
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
|
113 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
|
|
148 |
|
149 |
def to_dict(self):
|
150 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
151 |
+
|
152 |
+
# Skip the two new tasks for now
|
153 |
+
# TODO: safely remove this code when the task results are added
|
154 |
+
skip_avg_len = 0
|
155 |
+
if self.results['ko_winogrande'] == 0.0:
|
156 |
+
skip_avg_len += 1
|
157 |
+
if self.results['ko_gsm8k'] == 0.0:
|
158 |
+
skip_avg_len += 1
|
159 |
+
|
160 |
+
average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
|
161 |
data_dict = {
|
162 |
"eval_name": self.eval_name, # not a column, just a save name,
|
163 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
src/tools/plots.py
CHANGED
@@ -36,7 +36,12 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
|
36 |
|
37 |
current_date = row["date"]
|
38 |
if task.benchmark == "Average":
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
40 |
else:
|
41 |
current_score = row["results"][task.benchmark]
|
42 |
|
|
|
36 |
|
37 |
current_date = row["date"]
|
38 |
if task.benchmark == "Average":
|
39 |
+
avg_skip_len = 0
|
40 |
+
if row["results"]["ko_winogrande"] == 0.0:
|
41 |
+
avg_skip_len += 1
|
42 |
+
if row["results"]["ko_gsm8k"] == 0.0:
|
43 |
+
avg_skip_len += 1
|
44 |
+
current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
|
45 |
else:
|
46 |
current_score = row["results"][task.benchmark]
|
47 |
|