djstrong committed on
Commit c7cf816
1 Parent(s): aa391c7

description update

src/about.py CHANGED
@@ -36,12 +36,18 @@ class Tasks(Enum):
     task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
     task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
     task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
-    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     task24 = Task("polish_poquad_open_book", "levenshtein,none", "poquad_open_book", "generate_until", 0.0)
     task25 = Task("polish_eq_bench_first_turn", "first_eqbench,none", "eq_bench_first_turn", "generate_until", 0.0)
     task26 = Task("polish_eq_bench", "average_eqbench,none", "eq_bench", "generate_until", 0.0)
+    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     # task27 = Task("polish_eq_bench", "revised_eqbench,none", "eq_bench_revised", "other", 0.0)
 
+
+g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+all_tasks = g_tasks + mc_tasks
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
@@ -59,7 +65,7 @@ TITLE = """
 """
 
 # What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
+INTRODUCTION_TEXT = f"""
 The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
 For now, models are tested without their templates.
 
@@ -67,7 +73,14 @@ Almost every task has two versions: regex and multiple choice.
 * _g suffix means that a model needs to generate an answer (only suitable for instruction-based models)
 * _mc suffix means that a model is scored against every possible class (suitable also for base models)
 
-Average columns are normalized against scores by "Baseline (majority class)".
+Average columns are normalized against scores by "Baseline (majority class)". Tasks taken into account when calculating averages:
+* Average: {', '.join(all_tasks)}
+* Avg g: {', '.join(g_tasks)}
+* Avg mc: {', '.join(mc_tasks)}
+* Avg RAG: {', '.join(rag_tasks)}
+
+* `,chat` suffix means that a model is tested using chat templates
+* `,chat,multiturn` suffix means that a model is tested using chat templates and few-shot examples are treated as a multi-turn conversation
 
 We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
 """
@@ -85,7 +98,6 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 * fix long model names
 * add inference time
 * add more tasks
-* use model templates
 * fix scrolling on Firefox
 
 ## Tasks
@@ -114,12 +126,15 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 | polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
 | polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
 | poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
+| polish_poquad_open_book | enelpol/poleval2018_task3_test_10k | levenshtein | generate_until |
+| polish_eq_bench_first_turn | speakleash/EQ-Bench-PL | eq_bench | generate_until |
+| polish_eq_bench | speakleash/EQ-Bench-PL | eq_bench | generate_until |
 
 ## Reproducibility
 To reproduce our results, you need to clone the repository:
 
 ```
-git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish
+git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish3
 cd lm-evaluation-harness
 pip install -e .
 ```
@@ -127,8 +142,10 @@ pip install -e .
 and run benchmark for 0-shot and 5-shot:
 
 ```
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
 ```
 
 ## List of Polish models
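
The module-level lists this commit adds to src/about.py work because each `Tasks` member wraps a `Task` record whose fields the comprehensions read. Below is a minimal sketch of that container; the field names are inferred from the accesses in this diff (`task.value.benchmark`, `.type`, `.baseline`, `.col_name`), and the `baseline` default is an assumption, since `task20` is constructed without one:

```
from dataclasses import dataclass
from enum import Enum

# Hypothetical reconstruction of the Task container used above; field names
# are inferred from task.value.benchmark / .type / .baseline in this commit,
# and the baseline default is an assumption (task20 omits it).
@dataclass
class Task:
    benchmark: str         # lm-evaluation-harness task name
    metric: str            # metric key in the results file, e.g. "acc,none"
    col_name: str          # column header shown on the leaderboard
    type: str              # "generate_until", "multiple_choice" or "other"
    baseline: float = 0.0  # majority-class baseline, as a fraction

class Tasks(Enum):
    task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none",
                  "polqa_reranking_mc", "multiple_choice", 0.5335588952710677)
    task22 = Task("polish_polqa_open_book", "levenshtein,none",
                  "polqa_open_book_g", "generate_until", 0.0)

# Computed once at import time, exactly as the commit adds them, so the
# f-string INTRODUCTION_TEXT and read_evals.py can share the same lists.
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
all_tasks = g_tasks + mc_tasks

print(all_tasks)  # ['polish_polqa_open_book', 'polish_polqa_reranking_multiple_choice']
```

Since INTRODUCTION_TEXT becomes an f-string in this commit, the `{', '.join(all_tasks)}` placeholders are evaluated once at import time, which is why the lists have to be defined earlier in the module than the text that interpolates them.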
src/display/utils.py CHANGED
@@ -34,11 +34,11 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
 auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
 auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
 auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
+auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
 
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
 
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/leaderboard/read_evals.py CHANGED
@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np
 
+from src.about import all_tasks, g_tasks, mc_tasks, rag_tasks
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, NShotType
 from src.submission.check_validity import is_model_on_hub
@@ -183,10 +184,10 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
-        mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
-        rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
-        all_tasks = g_tasks + mc_tasks
+        # g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+        # mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+        # rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+        # all_tasks = g_tasks + mc_tasks
        all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
 
        baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
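
The read_evals.py hunks above swap the locally recomputed task lists for the shared ones imported from src.about, keeping the old comprehensions as comments. The `baselines` dict at the end feeds the normalization that the introduction text mentions; the exact formula is not part of this diff, so the sketch below assumes the common rescaling in which the majority-class baseline maps to 0 and a perfect score stays at 100:

```
# Hedged sketch: normalize() is a hypothetical helper, not code from this
# repo; the rescaling formula is an assumption based on the introduction text.
def normalize(score: float, baseline: float) -> float:
    """Map a 0-100 score so that `baseline` -> 0 and 100 -> 100."""
    if baseline >= 100.0:
        return 0.0
    return max(0.0, (score - baseline) / (100.0 - baseline) * 100.0)

# Example with the polqa_reranking_mc baseline from about.py
# (the baselines dict stores task.value.baseline * 100):
print(round(normalize(75.0, 53.35588952710677), 1))  # ~46.4
```

Importing the lists at module level also means they are computed once rather than on every `to_dict()` call, and it keeps src/about.py the single source of truth for which tasks enter each average.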