File size: 4,316 Bytes
256a159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from mmengine.config import read_base

# Assemble this config from three sibling config modules. The imports are
# deliberately nested inside mmengine's `read_base()` context manager and must
# stay in this form.
# NOTE(review): presumably the config loader treats imports inside this block
# as base configs to inherit from rather than ordinary Python imports --
# confirm against the mmengine config documentation before restructuring.
with read_base():
    # `models`: taken from the `hf_qwen_7b_chat` model config module.
    from .models.qwen.hf_qwen_7b_chat import models
    # `datasets`: taken from the leaderboard `qwen_chat` dataset collection.
    from .datasets.collections.leaderboard.qwen_chat import datasets
    # `summarizer`: taken from the leaderboard summarizer config module.
    from .summarizers.leaderboard import summarizer

'''
dataset                                 version    metric            mode    qwen-7b-chat-hf
--------------------------------------  ---------  ----------------  ------  -----------------
--------- 考试 Exam ---------           -          -                 -       -
ceval                                   -          naive_average     gen     56.07
agieval                                 -          naive_average     mixed   39.51
mmlu                                    -          naive_average     gen     53.49
cmmlu                                   -          naive_average     gen     55.29
GaokaoBench                             -          weighted_average  gen     48.01
ARC-c                                   ca1e8e     accuracy          ppl     74.92
ARC-e                                   ca1e8e     accuracy          ppl     85.71
--------- 语言 Language ---------       -          -                 -       -
WiC                                     efbd01     accuracy          gen     51.41
chid-dev                                25f3d3     accuracy          ppl     77.72
afqmc-dev                               4a1636     accuracy          gen     69.00
WSC                                     678cb5     accuracy          ppl     67.31
tydiqa-goldp                            -          naive_average     gen     15.32
flores_100                              -          naive_average     gen     10.00
--------- 知识 Knowledge ---------      -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.18
commonsense_qa                          ddaabf     accuracy          gen     76.41
triviaqa                                b6904f     score             gen     43.25
nq                                      23dc1a     score             gen     16.26
--------- 理解 Understanding ---------  -          -                 -       -
C3                                      e6778d     accuracy          gen     81.53
race-middle                             e0908b     accuracy          gen     83.01
race-high                               e0908b     accuracy          gen     77.79
openbookqa_fact                         49689a     accuracy          ppl     86.40
csl_dev                                 3c4211     accuracy          ppl     64.38
lcsts                                   0b3969     rouge1            gen     12.75
Xsum                                    207e69     rouge1            gen     20.21
eprstmt-dev                             ed0c5d     accuracy          ppl     85.00
lambada                                 de1af2     accuracy          gen     59.19
--------- 推理 Reasoning ---------      -          -                 -       -
cmnli                                   15e783     accuracy          ppl     48.08
ocnli                                   15e783     accuracy          ppl     51.40
AX_b                                    689df1     accuracy          ppl     65.67
AX_g                                    808a19     accuracy          ppl     76.12
RTE                                     808a19     accuracy          ppl     68.95
COPA                                    59f42c     accuracy          gen     92.00
ReCoRD                                  6f7cfc     score             gen     0.16
hellaswag                               8d79e0     accuracy          ppl     69.28
piqa                                    34eee7     accuracy          ppl     72.20
siqa                                    ea30d1     accuracy          ppl     72.88
math                                    2c0b9e     accuracy          gen     7.84
gsm8k                                   4c7f6e     accuracy          gen     45.41
drop                                    53a0a7     score             gen     39.62
openai_humaneval                        dd0dff     humaneval_pass@1  gen     10.98
mbpp                                    60ca11     score             gen     20.60
bbh                                     -          naive_average     gen     42.61
'''