update
- README.md +2 -2
- src/about.py +7 -46
- src/leaderboard/read_evals.py +1 -2
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
-title:
-emoji:
+title: Polish Medical Leaderboard
+emoji: 🇵🇱🩺🏆
 colorFrom: gray
 colorTo: red
 sdk: gradio
src/about.py
CHANGED
@@ -129,12 +129,7 @@ TITLE = """
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = f"""
-The leaderboard evaluates language models on
-For now, models are tested without theirs templates.
-
-Almost every task has two versions: regex and multiple choice.
-* _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
-* _mc suffix means that a model is scored against every possible class (suitable also for base models)
+The leaderboard evaluates language models on Polish Board Certification Examinations (Państwowy Egzamin Specjalizacyjny) from years 2018-2022.
 
 Average columns are normalized against scores by "Baseline (majority class)".
 
@@ -164,43 +159,13 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 
 Tasks taken into account while calculating averages:
 * Average: {', '.join(all_tasks)}
-
-* Avg mc: {', '.join(mc_tasks)}
-* Avg RAG: {', '.join(rag_tasks)}
-
-| Task                            | Dataset                               | Metric          | Type            |
-|---------------------------------|---------------------------------------|-----------------|-----------------|
-| polemo2_in                      | allegro/klej-polemo2-in               | accuracy        | generate_until  |
-| polemo2_in_mc                   | allegro/klej-polemo2-in               | accuracy        | multiple_choice |
-| polemo2_out                     | allegro/klej-polemo2-out              | accuracy        | generate_until  |
-| polemo2_out_mc                  | allegro/klej-polemo2-out              | accuracy        | multiple_choice |
-| 8tags_mc                        | sdadas/8tags                          | accuracy        | multiple_choice |
-| 8tags_g                         | sdadas/8tags                          | accuracy        | generate_until  |
-| belebele_mc                     | facebook/belebele                     | accuracy        | multiple_choice |
-| belebele_g                      | facebook/belebele                     | accuracy        | generate_until  |
-| dyk_mc                          | allegro/klej-dyk                      | binary F1       | multiple_choice |
-| dyk_g                           | allegro/klej-dyk                      | binary F1       | generate_until  |
-| ppc_mc                          | sdadas/ppc                            | accuracy        | multiple_choice |
-| ppc_g                           | sdadas/ppc                            | accuracy        | generate_until  |
-| psc_mc                          | allegro/klej-psc                      | binary F1       | multiple_choice |
-| psc_g                           | allegro/klej-psc                      | binary F1       | generate_until  |
-| cbd_mc                          | ptaszynski/PolishCyberbullyingDataset | macro F1        | multiple_choice |
-| cbd_g                           | ptaszynski/PolishCyberbullyingDataset | macro F1        | generate_until  |
-| klej_ner_mc                     | allegro/klej-nkjp-ner                 | accuracy        | multiple_choice |
-| klej_ner_g                      | allegro/klej-nkjp-ner                 | accuracy        | generate_until  |
-| polqa_reranking_mc              | ipipan/polqa                          | accuracy        | multiple_choice |
-| polqa_open_book_g               | ipipan/polqa                          | levenshtein     | generate_until  |
-| polqa_closed_book_g             | ipipan/polqa                          | levenshtein     | generate_until  |
-| poleval2018_task3_test_10k      | enelpol/poleval2018_task3_test_10k    | word perplexity | other           |
-| polish_poquad_open_book         | enelpol/poleval2018_task3_test_10k    | levenshtein     | generate_until  |
-| polish_eq_bench_first_turn      | speakleash/EQ-Bench-PL                | eq_bench        | generate_until  |
-| polish_eq_bench                 | speakleash/EQ-Bench-PL                | eq_bench        | generate_until  |
+
 
 ## Reproducibility
 To reproduce our results, you need to clone the repository:
 
 ```
-git clone https://github.com/speakleash/lm-evaluation-harness.git -b
+git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish4
 cd lm-evaluation-harness
 pip install -e .
 ```
@@ -208,18 +173,14 @@ pip install -e .
 and run benchmark for 0-shot and 5-shot:
 
 ```
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --output_path results/ --log_samples
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 0 --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 5 --output_path results/ --log_samples
 ```
 
 With chat templates:
 ```
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --output_path results/ --log_samples --apply_chat_template
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --output_path results/ --log_samples --apply_chat_template
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 0 --output_path results/ --log_samples --apply_chat_template
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 5 --output_path results/ --log_samples --apply_chat_template
 ```
 
 ## List of Polish models
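The updated INTRODUCTION_TEXT keeps the note that average columns are normalized against the "Baseline (majority class)" scores. A minimal sketch of what such a normalization typically looks like follows; the formula, the `normalize_against_baseline` name, and the example numbers are illustrative assumptions, not the leaderboard's actual code:

```python
# Hypothetical sketch of majority-class baseline normalization.
# Assumption: a raw task score is rescaled so that the majority-class
# baseline maps to 0 and a perfect score maps to 100.

def normalize_against_baseline(score: float, baseline: float, max_score: float = 100.0) -> float:
    """Rescale a raw score relative to the majority-class baseline."""
    if max_score == baseline:
        return 0.0
    return 100.0 * (score - baseline) / (max_score - baseline)

# Example: 70.0 accuracy on a task whose majority class already scores 55.0
# normalizes to roughly 33.3.
print(normalize_against_baseline(70.0, baseline=55.0))
```

Under this assumed formula, a model that never beats the majority-class guess sits at 0 in the averaged columns.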
src/leaderboard/read_evals.py
CHANGED
@@ -387,6 +387,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
     model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
+        if '_polish_pes_' not in root: continue
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
@@ -398,8 +399,6 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
             files = [files[-1]]
 
         for file in files:
-            print(file)
-            # if '_polish_pes_' not in file: continue
             model_result_filepaths.append(os.path.join(root, file))
 
     # print('PATHS:', model_result_filepaths)
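The net effect of the read_evals.py change is to move the task filter from a commented-out per-file check to a directory-level check: result folders whose path does not contain `_polish_pes_` are skipped before any JSON files are collected, and the debug `print(file)` is removed. A standalone sketch of that walk-and-filter pattern, assuming the usual results layout (the example paths are illustrative, not actual repository paths):

```python
import os

def collect_pes_result_files(results_path: str) -> list[str]:
    """Walk the results tree and keep only JSON files from polish_pes result folders."""
    model_result_filepaths = []
    for root, _, files in os.walk(results_path):
        # Skip any directory that does not belong to a polish_pes run.
        if "_polish_pes_" not in root:
            continue
        # Keep only directories that contain nothing but JSON result files.
        if len(files) == 0 or any(not f.endswith(".json") for f in files):
            continue
        for file in files:
            model_result_filepaths.append(os.path.join(root, file))
    return model_result_filepaths

# Illustrative layout (hypothetical paths): a file such as
#   results/speakleash/Bielik-7B-Instruct-v0.1_polish_pes_/results_2024-05-01.json
# is collected, while result folders for other task suites are skipped entirely.
```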