{
"GPT_4o": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.5187898818829914,
"micro_mean_score": 0.5127977300993917
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.5251654337401854,
"micro_mean_score": 0.522332974147119
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2448,
"macro_mean_score": 0.6478225794744895,
"micro_mean_score": 0.665391229578676
},
"overall_score": 0.5409529871515315
},
"Gemini_1.5_pro_002": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.46887846869580546,
"micro_mean_score": 0.46403536258864253
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.481393687771543,
"micro_mean_score": 0.4756661334397647
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2448,
"macro_mean_score": 0.5858190649927173,
"micro_mean_score": 0.6104901117798793
},
"overall_score": 0.4948345779089219
},
"Gemini_1.5_flash_002": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.4183865592515826,
"micro_mean_score": 0.41216971462683855
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.4183865592515826,
"micro_mean_score": 0.41216971462683855
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2168,
"macro_mean_score": 0.5691365176285039,
"micro_mean_score": 0.5987532244196045
},
"overall_score": 0.4377900192406913
},
"Claude_3.5": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.4863241841253708,
"micro_mean_score": 0.4798092874490549
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.5023557473841108,
"micro_mean_score": 0.4985442599850241
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2288,
"macro_mean_score": 0.6373907158949892,
"micro_mean_score": 0.6569647463456579
},
"overall_score": 0.519736485905313
},
"GPT_4o_mini": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.3974259652331149,
"micro_mean_score": 0.392578163407945
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.4070959243997505,
"micro_mean_score": 0.40376078514357017
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 1224,
"macro_mean_score": 0.586537827213665,
"micro_mean_score": 0.6133276010318144
},
"overall_score": 0.43019240694015537
},
"Qwen2_VL_72B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.4623988230573754,
"micro_mean_score": 0.4568583770401895
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.45284699372478177,
"micro_mean_score": 0.4487693487093462
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2448,
"macro_mean_score": 0.5639771804231668,
"micro_mean_score": 0.5835339638865004
},
"overall_score": 0.4754732650945565
},
"Qwen2_VL_7B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.34725455697890745,
"micro_mean_score": 0.34344091516995323
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.3284357723853296,
"micro_mean_score": 0.32443422147119677
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1170,
"num_total_samples": 2452,
"macro_mean_score": 0.43955105763038577,
"micro_mean_score": 0.45508547008546996
},
"overall_score": 0.35913430458751355
},
"llava_onevision_72B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.31960132549012704,
"micro_mean_score": 0.3173848563095166
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.29725827011768174,
"micro_mean_score": 0.2954433666362564
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 1224,
"macro_mean_score": 0.4599484231632498,
"micro_mean_score": 0.4850386930352536
},
"overall_score": 0.33766580340844976
},
"llava_onevision_7B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.2239290419841492,
"micro_mean_score": 0.22222171180488767
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.21347545703998197,
"micro_mean_score": 0.210586172002703
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2448,
"macro_mean_score": 0.33979975321921935,
"micro_mean_score": 0.36474634565778147
},
"overall_score": 0.23884309392529685
},
"InternVL2_76B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.34977582844066846,
"micro_mean_score": 0.3452353155814884
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.35539585884136143,
"micro_mean_score": 0.35043335903915124
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 1224,
"macro_mean_score": 0.5192997443033639,
"micro_mean_score": 0.5421324161650903
},
"overall_score": 0.37649239855429245
},
"InternVL2_8B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.25920867490737526,
"micro_mean_score": 0.2543416126895087
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.24055897165959364,
"micro_mean_score": 0.23784634936127952
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1165,
"num_total_samples": 2452,
"macro_mean_score": 0.3978571701460552,
"micro_mean_score": 0.4108583690987125
},
"overall_score": 0.2770545208291856
},
"MiniCPM_v2.6": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.22838207666977445,
"micro_mean_score": 0.22452805919103805
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.22901463640480854,
"micro_mean_score": 0.2250606411323753
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2448,
"macro_mean_score": 0.41728623355613875,
"micro_mean_score": 0.43452278589853827
},
"overall_score": 0.25324761425596987
},
"Phi-3.5-vision": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.23240864879023493,
"micro_mean_score": 0.22932978620408923
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.2295097914016776,
"micro_mean_score": 0.2266573336398296
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2428,
"macro_mean_score": 0.3947914647737769,
"micro_mean_score": 0.42459157351676696
},
"overall_score": 0.2533094072831661
},
"Pixtral_12B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.3186510310643637,
"micro_mean_score": 0.3151734861550665
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.3132232487306254,
"micro_mean_score": 0.30971424472967524
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 1224,
"macro_mean_score": 0.4566234428542061,
"micro_mean_score": 0.4870593293207223
},
"overall_score": 0.3364098563442444
},
"Llama_3_2_11B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.10044261716549671,
"micro_mean_score": 0.09980638766828835
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.15984490401619783,
"micro_mean_score": 0.15794038158731832
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 1224,
"macro_mean_score": 0.3173342406187366,
"micro_mean_score": 0.3487962166809973
},
"overall_score": 0.1801158087274157
},
"Idefics3": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.11118980301103833,
"micro_mean_score": 0.11201785633274061
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"num_total_samples": 6961,
"macro_mean_score": 0.08956972487602757,
"micro_mean_score": 0.08982225274252693
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"num_total_samples": 2448,
"macro_mean_score": 0.3210866162255635,
"micro_mean_score": 0.35649183147033553
},
"overall_score": 0.138206224513898
}
}