{
"GPT_4o_mini": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.44928744961868194
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.48842488118273475
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5152626716886682
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.4672966076116977
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.3406008235342885
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5572281917334303
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6902380952380953
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.4189154010048976
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2943206715105082
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.19422793560945503
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.4700389569079038
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.3624496929166193
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.38946844562183286
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.45508480503584553
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.47569921440672464
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.46468618797917643
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.29410984789062117
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.41174000979649644
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.38893151244736324
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.44244772638735347
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3629944944697668
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5713834131825314
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.39874839531459466
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.3359977324263039
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.4260710116168476
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.46322170353087255
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.45508480503584553
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.24651576711552803
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.3697506340557095
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5640948591986592
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2420320329702607
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.3458483931206892
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.43544861040322835
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5176671720617656
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.3554299482098288
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5398829253460956
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.32918280841495845
}
}
},
"Llama_3_2_11B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.1907604552173455
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.14280015951776653
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.1960311445935766
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.22399113135844315
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.13303760019716085
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.323153603297999
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4260501253132832
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.1770852858056774
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.15366454315378308
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.06563884729522687
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.11886347847341794
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.11489351406848371
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.1693681214060816
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2123769209846321
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2520175802062012
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.24806929522702081
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.06418655520777307
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.12349256529641485
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.16374180545556977
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.1576236804437753
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.15014439824913947
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3003142292328822
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.19270157739425633
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1463246409674981
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.0732004839476103
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.19579907898674231
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2123769209846321
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.1351857051327849
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.18586695387250338
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.17288724679416761
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08100042975820579
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.0575426944971537
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.19853488174071646
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.254316961351997
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.162801811963855
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.28055776664538923
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.13937853323074623
}
}
},
"InternVL2_8B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2817247716997634
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.2794121858805306
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2511,
"tasks": [],
"average_score": 0.31918687243853283
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2469,
"tasks": [],
"average_score": 0.325593535916075
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.24118253695139918
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.39684007367798446
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4700852130325815
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.27052668526005397
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2439,
"tasks": [],
"average_score": 0.23189345356483618
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08260405712900723
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.2277532691786533
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2013779290163996
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.2804429603269583
},
"Videos": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.34791358240562653
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2942163420306113
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.33787327172644077
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.10933317885944857
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.24944408255581693
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.25203287826995174
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.27414636444623874
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.22381302045502052
},
"open_ended_output": {
"count": 80,
"num_samples": 1456,
"tasks": [],
"average_score": 0.3537549824897016
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.30261189962428353
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.15434618291761149
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.19814032315010577
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.30046383040641306
},
"video": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.34791358240562653
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17725087609332119
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2532272454839157
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.29096771640715396
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.12166926715781588
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.24700310231619527
},
"Perception": {
"count": 145,
"num_samples": 2315,
"tasks": [],
"average_score": 0.3205471121079154
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3995660275981844
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.24614711281861912
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3393895915929317
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.22078333222564453
}
}
},
"llava_onevision_7B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2524786809911341
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.1902376706945491
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.255069390206439
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.29981286990552625
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.18973491465938852
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.36842322314565323
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.44998746867167916
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.2445135206648208
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.21802943568344288
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.06658775725427067
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.1466163383815089
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.13297395577964055
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.24236719143449742
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.30985943541023103
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3199731020402028
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3258716730180874
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.13043163858789789
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.20209776978059824
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.18285692568564196
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.25384794412815426
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.2200472229099345
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3127341248874411
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.2802999516721972
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1476473922902494
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.13787962981142515
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.25459683619676365
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.30985943541023103
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.1778991941079372
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2410111891690358
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.19274192395698486
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.09846926279075068
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.15189414475467605
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.2845922887108415
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3600079950628582
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.23654776813656775
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3271805711561501
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.22080546908673507
}
}
},
"llava_onevision_72B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.3615741356043519
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.282401662313336
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.36653344218973427
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.42146038539739283
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2951434804409883
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.478119286755779
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6005438596491229
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.31663222188988865
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.29633645022129285
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.13872280436872364
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.23294708136735856
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2126914943750874
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.34566020099204997
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4446001874842145
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.4401364830377099
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.42429297143518147
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.23897262553543516
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.28614732096244
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.25872873777911126
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.370724080249463
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3065719940769206
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4293132525502993
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3986052416087927
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.20730347694633405
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.27911174307216713
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3481968601113118
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4446001874842145
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.25013213032747944
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.34156793747875674
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.30653989171354723
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.18168666652660437
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.23240790940031927
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.38316803441883945
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4807891958712894
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.31702495228966576
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.4358874880224115
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.31588468105075895
}
}
},
"Gemini_1.5_pro_002": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5201947642961418
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.4989864259016192
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.550842111088751
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5467324805307577
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.425969084163906
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5750369536204262
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6982330827067671
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.513647745999633
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.3845337030093212
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.23899503258223884
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.4592162957187749
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4292353723689881
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4869625906903554
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5028718355967439
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5584779204331461
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5495643443147615
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.4292127751495457
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.44828282747008336
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.44137714463131966
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5146447350354234
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4688623462674191
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5580414823700747
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5538255562099124
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.39066515495086923
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5295721925617263
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5032283218366624
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5028718355967439
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4885398161821004
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.4553778359922855
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5378983862471568
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.3335324339429373
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.43465181771633377
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.524603412718188
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5821004797173627
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5124355410095621
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5721991184410764
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.41210885517904977
}
}
},
"MiniCPM_v2.6": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2604969133146555
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.24828453993935928
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.2987613496312298
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.31808788094038193
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.18281637763548025
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.4073231792632807
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.48798245614035085
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.23723675736151562
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.1968926733821904
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08735883237069725
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.21153173491931837
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.18639148159043903
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.21578309681746147
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.3527537836840162
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3096882575625531
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.31628986040092516
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.0755920550038197
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.23302306387939006
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.17775369699584467
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2551275278138797
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.20833171754655547
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.36473950920880716
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.293386806641223
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.13955971277399848
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.23499726844115643
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.2625611181730622
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.3527537836840162
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17888270664238365
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.22288678972853282
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.26614948589295767
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.11693267119342445
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.15342045420318667
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.2910511308735813
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3777897246686755
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.25714862989687987
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.33187792895542906
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.16493399805627715
}
}
},
"GPT_4o": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5628292541089482
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6173690896799526
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.6122177959113034
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5822888182775097
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.44177544539510955
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.6344814691282928
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6795263157894738
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5514924675940659
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.39435038953269674
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.22934807257231926
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.6046575685772053
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.491325251564869
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4999089647103332
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5315979872161023
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5641404607063637
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.559466820210236
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.47760591698367955
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5354190939719853
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.4780999465727382
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5994159671881645
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.44606605087301393
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.6274371950293718
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5448877153826162
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.4751133786848073
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5265640970967286
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5664191419997976
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5315979872161023
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4500928191484624
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.490800991115688
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.7011776751799048
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.33202130899313653
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5032849161169843
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5491960044393517
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6095778863474799
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5283797185155754
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.6135384893140922
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.44047720383044436
}
}
},
"Phi-3.5-vision": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2551037902226636
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.24734930136620975
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.2864612416413776
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3049602749093698
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.21653804346780042
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.36823084724842464
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.46663157894736845
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.24145330077248778
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2154692063816354
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08944481289041872
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.18587661796707747
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.17497379027990792
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.26053460127801603
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.24669318645450836
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2786226802221388
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.34091066308972107
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.15444746077692828
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.21711219915973207
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.2138304528863496
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2572371188897671
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.21409351002477045
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.365192668303297
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.25960269434727634
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.12546296296296297
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.14174374624685185
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.2776898347355035
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.24669318645450836
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.20168001345379397
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2850550871176333
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.22277777000798116
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08928724806836039
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.219367263034246
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.31585879714366544
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3945898792928062
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.21925278489551242
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.33264696401038385
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.17575913004138646
}
}
},
"InternVL2_76B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.38191947207402666
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.4103649605406274
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.4341802504488193
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.42654142415639185
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2975890791763991
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5257357753421337
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5779473684210527
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.33287081421166276
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2949505390920417
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.17036496432397477
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.362195416198664
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.31396468806559114
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3473756113126343
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.395893002855977
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.44982107744035305
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.42686510293379315
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.2868239162778749
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3603288661353782
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3465926907358438
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.3943337471922549
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.29244088978470345
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.45822072478616577
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3879326330400817
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.20309901738473166
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.34490184941501867
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.41372274360003347
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.395893002855977
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.24403942809507134
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.3152784738582855
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4290949563510903
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2132321995754061
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2953329718984368
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.4201902630957567
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.47409276729986083
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.30014798153766264
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.46253164682269177
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.2868813944130515
}
}
},
"Gemini_1.5_flash_002": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.46250942866818673
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.4317914359988347
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.49775198805427967
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5098686082319499
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.34393279682972117
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5594391803821158
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6380250626566416
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.44816564352475535
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.34510790215980036
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.18973764406890803
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3836737169374586
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.3598139859097534
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4013870708864889
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4903530871753026
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5051202896842343
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5166044655846657
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.3849084036535956
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3869438864407766
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3962715194192418
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.44793686445264996
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3704146726364947
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5448638967636353
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.47829883834573317
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.33669690098261523
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.4300676062024303
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.4427944359714585
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4903530871753026
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.42346517633403413
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.41994719346489817
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4627701625196691
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2517485212411566
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.40372378342017806
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.4799408254775632
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6010361821632402
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.4569546533897065
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.511590428993871
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.33710867194177685
}
}
},
"Pixtral_12B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.34602671066871027
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.3764652079852679
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.38183869685317606
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3776679463596073
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2828575553466608
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.4190587833823822
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5687919799498747
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.32813540763467464
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2677293131171651
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.10591240329992047
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.30581019415764066
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.28832738144368647
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3223299098375932
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.409643099998057
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.37450808136321684
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.37068890840142343
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.24009431093278263
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3071379066920702
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.31782992537086313
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.3639544140938305
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.32073418701669026
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4166613092238043
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3008126415966517
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.19743008314436883
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.16370884074367903
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.37086966536142313
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.409643099998057
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.2575699315401612
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.310449170121381
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4285286292013588
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.13622980866275425
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2572414987500377
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.388749951743596
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5020540387409291
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.31301986568151985
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.38094471423409354
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.24222628640267738
}
}
},
"Claude_3.5": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5405089647404562
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6026892335040172
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5695311134746034
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5450038475783499
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.4767692987630454
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5756126284078804
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6969774436090224
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5278843049497918
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.4082144793870471
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.23803578664609892
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.5637906302497772
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4795267886975966
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.525848282456283
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.508735695828719
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5699094130430454
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5078124682977725
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.4429640420975014
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5039586533964282
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.4926030136534706
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5278127103234661
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4490020843308984
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5838224169821388
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5456152399978661
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.46300075585789874
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5292494759360522
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5364554303845326
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.508735695828719
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4422556748863689
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.49311554035078103
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.6593763006847053
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.3382015835012861
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5194010220575684
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5304907166726288
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5808831682303479
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.513474611293123
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5507075880782885
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.47461998432626556
}
}
},
"Idefics3": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.14507788965553362
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.11641535161320743
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.17255583910766542
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.14745217246476708
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.1331851390883708
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.19221534222332276
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.28640852130325817
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.17906399043310475
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.10192930055370109
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.04211916597550756
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.10126271262360581
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.11407926733108291
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.16225217317782772
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.16181866973635636
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.1839408679813373
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.14933801491626408
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.0395540896656236
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.13979628998424784
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.1062779093260333
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.07053056796593082
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.09790172378722654
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.2987797010800956
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.11588163814170001
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1008692365835223
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.09308121224497533
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.14757589734485796
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.16181866973635636
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.12217834249866026
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.12276246278377517
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.14743542163139847
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.05354869594691955
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.09065540194572455
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.1463280929280822
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.14564374862578883
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.22748773785486257
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.17647756032677067
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.13168972973651977
}
}
},
"Qwen2_VL_7B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.370836862933556
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.39973692484032347
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2511,
"tasks": [],
"average_score": 0.4012977216731433
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2469,
"tasks": [],
"average_score": 0.410990923097227
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2818925976996871
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.493608784197707
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5215889724310777
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.33309401517140946
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2439,
"tasks": [],
"average_score": 0.27564756843599875
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.1473690605854188
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3814353882556586
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2896392967775049
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3223325179806271
},
"Videos": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.4111189310485516
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.34825121621909577
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.4047366473438155
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.262166593895899
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3403519326516044
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3420538306638288
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.35162604166912687
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.32665673520415817
},
"open_ended_output": {
"count": 80,
"num_samples": 1456,
"tasks": [],
"average_score": 0.3909745200389741
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.39898011714302023
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.19415154950869234
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.37301502633138073
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3761693199448087
},
"video": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.4111189310485516
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.26429868057315387
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.33008667137716374
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.42660307298355216
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2003871750665659
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.3270187644950453
},
"Perception": {
"count": 145,
"num_samples": 2315,
"tasks": [],
"average_score": 0.39864841947520724
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4245693009859056
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.29880557491654197
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.42766370932167636
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.25562039051316643
}
}
},
"Qwen2_VL_72B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.49774395003470484
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.538829507114716
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.534480883952292
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5092565754998357
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.3776739609562984
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5676174603436022
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.60496992481203
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.4633019068994453
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.35105970797600183
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.2201150812944581
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.5356361790015363
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4289777675393297
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.42094543671351287
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.49943888306036405
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.507967430369507
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.495761900914191
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.36212605501536715
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.4444770652190341
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.44584364394901616
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5098505660529429
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4027115384266939
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5157810622684265
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5199940976484408
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.3100812547241119
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5364299983756791
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.4908605783408196
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.49943888306036405
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.36691704884033916
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.45169664275718613
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5748195752273694
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.31245958897213383
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.4372517645050852
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5343715685033166
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4968249101570037
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.4488852456563113
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5162919233645259
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.31157492395100744
}
}
}
}