Detailed results can be found [here](https://huggingface.co/datasets/open-llm-le

| Metric              | Value |
|:--------------------|------:|
| MATH Lvl 5 (4-shot) |  6.04 |
| GPQA (0-shot)       | 15.21 |
| MuSR (0-shot)       | 13.61 |
| MMLU-PRO (5-shot)   | 46.85 |

Per-task comparison of the 72b and 95b models:

| Key                                                                    | 72b Result | 95b Result | Difference | Which is Higher | Multiplier |
|:-----------------------------------------------------------------------|-----------:|-----------:|-----------:|:----------------|:-----------|
| leaderboard_musr.acc_norm,none                                         |      0.419 |      0.427 |      0.008 | 95b             | 1.02x      |
| leaderboard_bbh_sports_understanding.acc_norm,none                     |      0.892 |      0.876 |      0.016 | 72b             | 0.98x      |
| leaderboard_bbh_logical_deduction_three_objects.acc_norm,none          |      0.94  |      0.928 |      0.012 | 72b             | 0.99x      |
| leaderboard_math_geometry_hard.exact_match,none                        |      0     |      0.008 |      0.008 | 95b             | 0.00x      |
| leaderboard_gpqa.acc_norm,none                                         |      0.375 |      0.364 |      0.011 | 72b             | 0.97x      |
| leaderboard_math_hard.exact_match,none                                 |      0.012 |      0.06  |      0.048 | 95b             | 5.00x      |
| leaderboard.exact_match,none                                           |      0.012 |      0.06  |      0.048 | 95b             | 5.00x      |
| leaderboard.prompt_level_loose_acc,none                                |      0.861 |      0.839 |      0.022 | 72b             | 0.97x      |
| leaderboard.prompt_level_strict_acc,none                               |      0.839 |      0.813 |      0.026 | 72b             | 0.97x      |
| leaderboard.inst_level_loose_acc,none                                  |      0.904 |      0.891 |      0.013 | 72b             | 0.99x      |
| leaderboard.acc_norm,none                                              |      0.641 |      0.622 |      0.02  | 72b             | 0.97x      |
| leaderboard.inst_level_strict_acc,none                                 |      0.888 |      0.873 |      0.016 | 72b             | 0.98x      |
| leaderboard.acc,none                                                   |      0.563 |      0.522 |      0.041 | 72b             | 0.93x      |
| leaderboard_bbh_causal_judgement.acc_norm,none                         |      0.668 |      0.663 |      0.005 | 72b             | 0.99x      |
| leaderboard_bbh_salient_translation_error_detection.acc_norm,none      |      0.668 |      0.588 |      0.08  | 72b             | 0.88x      |
| leaderboard_gpqa_extended.acc_norm,none                                |      0.372 |      0.364 |      0.007 | 72b             | 0.98x      |
| leaderboard_math_prealgebra_hard.exact_match,none                      |      0.047 |      0.155 |      0.109 | 95b             | 3.33x      |
| leaderboard_math_algebra_hard.exact_match,none                         |      0.02  |      0.114 |      0.094 | 95b             | 5.83x      |
| leaderboard_bbh_boolean_expressions.acc_norm,none                      |      0.936 |      0.92  |      0.016 | 72b             | 0.98x      |
| leaderboard_math_num_theory_hard.exact_match,none                      |      0     |      0.058 |      0.058 | 95b             | 0.00x      |
| leaderboard_bbh_movie_recommendation.acc_norm,none                     |      0.768 |      0.78  |      0.012 | 95b             | 1.02x      |
| leaderboard_math_counting_and_prob_hard.exact_match,none               |      0     |      0.024 |      0.024 | 95b             | 0.00x      |
| leaderboard_math_intermediate_algebra_hard.exact_match,none            |      0     |      0.004 |      0.004 | 95b             | 0.00x      |
| leaderboard_ifeval.prompt_level_strict_acc,none                        |      0.839 |      0.813 |      0.026 | 72b             | 0.97x      |
| leaderboard_ifeval.inst_level_strict_acc,none                          |      0.888 |      0.873 |      0.016 | 72b             | 0.98x      |
| leaderboard_ifeval.inst_level_loose_acc,none                           |      0.904 |      0.891 |      0.013 | 72b             | 0.99x      |
| leaderboard_ifeval.prompt_level_loose_acc,none                         |      0.861 |      0.839 |      0.022 | 72b             | 0.97x      |
| leaderboard_bbh_snarks.acc_norm,none                                   |      0.927 |      0.904 |      0.022 | 72b             | 0.98x      |
| leaderboard_bbh_web_of_lies.acc_norm,none                              |      0.676 |      0.616 |      0.06  | 72b             | 0.91x      |
| leaderboard_bbh_penguins_in_a_table.acc_norm,none                      |      0.719 |      0.767 |      0.048 | 95b             | 1.07x      |
| leaderboard_bbh_hyperbaton.acc_norm,none                               |      0.892 |      0.9   |      0.008 | 95b             | 1.01x      |
| leaderboard_bbh_object_counting.acc_norm,none                          |      0.612 |      0.544 |      0.068 | 72b             | 0.89x      |
| leaderboard_musr_object_placements.acc_norm,none                       |      0.258 |      0.285 |      0.027 | 95b             | 1.11x      |
| leaderboard_bbh_logical_deduction_five_objects.acc_norm,none           |      0.704 |      0.592 |      0.112 | 72b             | 0.84x      |
| leaderboard_musr_team_allocation.acc_norm,none                         |      0.456 |      0.396 |      0.06  | 72b             | 0.87x      |
| leaderboard_bbh_navigate.acc_norm,none                                 |      0.832 |      0.788 |      0.044 | 72b             | 0.95x      |
| leaderboard_bbh_tracking_shuffled_objects_seven_objects.acc_norm,none  |      0.34  |      0.304 |      0.036 | 72b             | 0.89x      |
| leaderboard_bbh_formal_fallacies.acc_norm,none                         |      0.776 |      0.756 |      0.02  | 72b             | 0.97x      |
| leaderboard_gpqa_main.acc_norm,none                                    |      0.375 |      0.355 |      0.02  | 72b             | 0.95x      |
| leaderboard_bbh_disambiguation_qa.acc_norm,none                        |      0.744 |      0.772 |      0.028 | 95b             | 1.04x      |
| leaderboard_bbh_tracking_shuffled_objects_five_objects.acc_norm,none   |      0.32  |      0.284 |      0.036 | 72b             | 0.89x      |
| leaderboard_bbh_date_understanding.acc_norm,none                       |      0.784 |      0.764 |      0.02  | 72b             | 0.97x      |
| leaderboard_bbh_geometric_shapes.acc_norm,none                         |      0.464 |      0.412 |      0.052 | 72b             | 0.89x      |
| leaderboard_bbh_reasoning_about_colored_objects.acc_norm,none          |      0.864 |      0.84  |      0.024 | 72b             | 0.97x      |
| leaderboard_musr_murder_mysteries.acc_norm,none                        |      0.548 |      0.604 |      0.056 | 95b             | 1.10x      |
| leaderboard_bbh_ruin_names.acc_norm,none                               |      0.888 |      0.86  |      0.028 | 72b             | 0.97x      |
| leaderboard_bbh_logical_deduction_seven_objects.acc_norm,none          |      0.644 |      0.664 |      0.02  | 95b             | 1.03x      |
| leaderboard_bbh.acc_norm,none                                          |      0.726 |      0.701 |      0.025 | 72b             | 0.97x      |
| leaderboard_bbh_temporal_sequences.acc_norm,none                       |      0.996 |      0.968 |      0.028 | 72b             | 0.97x      |
| leaderboard_mmlu_pro.acc,none                                          |      0.563 |      0.522 |      0.041 | 72b             | 0.93x      |
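
The derived columns in this table can be reproduced from the raw per-task scores. Below is a minimal sketch of that arithmetic, assuming the Multiplier column is the 95b score divided by the 72b score (reported as 0.00x when the 72b score is 0). The dictionaries hold a small illustrative subset of the scores, not the full evaluation output, and the published multipliers appear to have been computed from unrounded scores, so the last digit can differ.

```python
# Sketch of how the Difference, Which is Higher, and Multiplier columns
# can be derived. The dicts below hold an illustrative subset of the
# per-task scores; they are not the full evaluation output.
results_72b = {
    "leaderboard_musr.acc_norm,none": 0.419,
    "leaderboard_math_algebra_hard.exact_match,none": 0.02,
}
results_95b = {
    "leaderboard_musr.acc_norm,none": 0.427,
    "leaderboard_math_algebra_hard.exact_match,none": 0.114,
}

for key, a in results_72b.items():
    b = results_95b[key]
    diff = abs(b - a)                    # "Difference" column
    higher = "95b" if b > a else "72b"   # "Which is Higher" column
    # "Multiplier" column: assumed to be 95b / 72b; the table above
    # reports 0.00x when the 72b score is 0, so mirror that here.
    mult = b / a if a else 0.0
    print(f"| {key} | {a} | {b} | {diff:.3f} | {higher} | {mult:.2f}x |")
```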