alexmarques committed · Commit e6a4ff7 · Parent(s): d292013
Update README.md

README.md CHANGED
@@ -150,9 +150,10 @@
 
 ### Accuracy
 
-#### Open LLM Leaderboard evaluation scores
 <table>
 <tr>
+<td><strong>Category</strong>
+</td>
 <td><strong>Benchmark</strong>
 </td>
 <td><strong>Meta-Llama-3.1-8B-Instruct </strong>
@@ -163,7 +164,9 @@
 </td>
 </tr>
 <tr>
-<td><strong>
+<td rowspan="1" ><strong>LLM as a judge</strong>
+</td>
+<td>Arena Hard
 </td>
 <td>25.8 (25.1 / 26.5)
 </td>
@@ -173,10 +176,8 @@
 </td>
 </tr>
 <tr>
-<td><strong>OpenLLM v1</strong>
+<td rowspan="8" ><strong>OpenLLM v1</strong>
 </td>
-</tr>
-<tr>
 <td>MMLU (5-shot)
 </td>
 <td>68.3
@@ -257,10 +258,8 @@
 </td>
 </tr>
 <tr>
-<td><strong>OpenLLM v2</strong>
+<td rowspan="7" ><strong>OpenLLM v2</strong>
 </td>
-</tr>
-<tr>
 <td>MMLU-Pro (5-shot)
 </td>
 <td>30.8
@@ -291,7 +290,7 @@
 </td>
 </tr>
 <tr>
-<td>Math
+<td>Math-lvl-5 (4-shot)
 </td>
 <td>15.7
 </td>
@@ -331,10 +330,8 @@
 </td>
 </tr>
 <tr>
-<td><strong>Coding</strong>
+<td rowspan="2" ><strong>Coding</strong>
 </td>
-</tr>
-<tr>
 <td>HumanEval pass@1
 </td>
 <td>67.3
@@ -354,8 +351,81 @@
 <td>98.8%
 </td>
 </tr>
+<tr>
+<td rowspan="9" ><strong>Multilingual</strong>
+</td>
+<td>Portuguese MMLU (5-shot)
+</td>
+<td>59.96
+</td>
+<td>59.36
+</td>
+<td>99.0%
+</td>
+</tr>
+<tr>
+<td>Spanish MMLU (5-shot)
+</td>
+<td>60.25
+</td>
+<td>59.77
+</td>
+<td>99.2%
+</td>
+</tr>
+<tr>
+<td>Italian MMLU (5-shot)
+</td>
+<td>59.23
+</td>
+<td>58.61
+</td>
+<td>99.0%
+</td>
+</tr>
+<tr>
+<td>German MMLU (5-shot)
+</td>
+<td>58.63
+</td>
+<td>58.23
+</td>
+<td>99.3%
+</td>
+</tr>
+<tr>
+<td>French MMLU (5-shot)
+</td>
+<td>59.65
+</td>
+<td>58.70
+</td>
+<td>98.4%
+</td>
+</tr>
+<tr>
+<td>Hindi MMLU (5-shot)
+</td>
+<td>50.10
+</td>
+<td>49.33
+</td>
+<td>98.5%
+</td>
+</tr>
+<tr>
+<td>Thai MMLU (5-shot)
+</td>
+<td>49.12
+</td>
+<td>48.09
+</td>
+<td>97.9%
+</td>
+</tr>
 </table>
 
+
 ### Reproduction
 
 The results were obtained using the following commands:
@@ -447,6 +517,90 @@
 --batch_size auto
 ```
 
+#### MMLU Portuguese
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_pt_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Spanish
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_es_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Italian
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_it_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU German
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_de_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU French
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_fr_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Hindi
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_hi_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Thai
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_th_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
 #### HumanEval and HumanEval+
 ##### Generation
 ```
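
The rightmost column of the updated table reports recovery. The card does not spell out the formula, but from the values shown it appears to be the quantized model's score divided by the unquantized baseline's score (e.g. Portuguese MMLU: 59.36 / 59.96 ≈ 99.0%). A quick sanity check of that inferred definition:

```
# Recovery inferred as quantized / baseline (not stated explicitly in the card).
awk 'BEGIN { printf "%.1f%%\n", 59.36 / 59.96 * 100 }'   # Portuguese MMLU -> 99.0%
awk 'BEGIN { printf "%.1f%%\n", 48.09 / 49.12 * 100 }'   # Thai MMLU       -> 97.9%
```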
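The seven multilingual MMLU reproduction commands added in this commit are identical except for the task name, which follows the pattern mmlu_<lang>_llama_3.1_instruct with lang in {pt, es, it, de, fr, hi, th}. A minimal sketch that runs them all in one pass, assuming a POSIX shell and the flags exactly as they appear in the diff:

```
# Run all seven multilingual MMLU evaluations added in this commit.
# Task names and flags are taken verbatim from the commands above.
for lang in pt es it de fr hi th; do
  lm_eval \
    --model vllm \
    --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
    --tasks "mmlu_${lang}_llama_3.1_instruct" \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 5 \
    --batch_size auto
done
```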