{ "GPT_4o": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.5187898818829914, "micro_mean_score": 0.5127977300993917 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.5251654337401854, "micro_mean_score": 0.522332974147119 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.6478225794744895, "micro_mean_score": 0.665391229578676 }, "overall_score": 0.5409529871515315 }, "Gemini_1.5_pro_002": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.46887846869580546, "micro_mean_score": 0.46403536258864253 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.481393687771543, "micro_mean_score": 0.4756661334397647 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.5858190649927173, "micro_mean_score": 0.6104901117798793 }, "overall_score": 0.4948345779089219 }, "Gemini_1.5_flash_002": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4183865592515826, "micro_mean_score": 0.41216971462683855 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4183865592515826, "micro_mean_score": 0.41216971462683855 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2168, "macro_mean_score": 0.5691365176285039, "micro_mean_score": 0.5987532244196045 }, "overall_score": 0.4377900192406913 }, "Claude_3.5": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4863241841253708, "micro_mean_score": 0.4798092874490549 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.5023557473841108, "micro_mean_score": 0.4985442599850241 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2288, "macro_mean_score": 0.6373907158949892, "micro_mean_score": 0.6569647463456579 }, "overall_score": 0.519736485905313 }, "GPT_4o_mini": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3974259652331149, "micro_mean_score": 0.392578163407945 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4070959243997505, "micro_mean_score": 0.40376078514357017 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.586537827213665, "micro_mean_score": 0.6133276010318144 }, "overall_score": 0.43019240694015537 }, "Qwen2_VL_72B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4623988230573754, "micro_mean_score": 0.4568583770401895 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.45284699372478177, "micro_mean_score": 0.4487693487093462 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.5639771804231668, "micro_mean_score": 0.5835339638865004 }, "overall_score": 0.4754732650945565 }, "Qwen2_VL_7B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.34725455697890745, "micro_mean_score": 0.34344091516995323 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3284357723853296, "micro_mean_score": 0.32443422147119677 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1170, "num_total_samples": 2452, "macro_mean_score": 0.43955105763038577, "micro_mean_score": 0.45508547008546996 }, "overall_score": 0.35913430458751355 }, "llava_onevision_72B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.31960132549012704, "micro_mean_score": 0.3173848563095166 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.29725827011768174, "micro_mean_score": 0.2954433666362564 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.4599484231632498, "micro_mean_score": 0.4850386930352536 }, "overall_score": 0.33766580340844976 }, "llava_onevision_7B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.2239290419841492, "micro_mean_score": 0.22222171180488767 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.21347545703998197, "micro_mean_score": 0.210586172002703 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.33979975321921935, "micro_mean_score": 0.36474634565778147 }, "overall_score": 0.23884309392529685 }, "InternVL2_76B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.34977582844066846, "micro_mean_score": 0.3452353155814884 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.35539585884136143, "micro_mean_score": 0.35043335903915124 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.5192997443033639, "micro_mean_score": 0.5421324161650903 }, "overall_score": 0.37649239855429245 }, "InternVL2_8B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.25920867490737526, "micro_mean_score": 0.2543416126895087 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.24055897165959364, "micro_mean_score": 0.23784634936127952 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1165, "num_total_samples": 2452, "macro_mean_score": 0.3978571701460552, "micro_mean_score": 0.4108583690987125 }, "overall_score": 0.2770545208291856 }, "MiniCPM_v2.6": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.22838207666977445, "micro_mean_score": 0.22452805919103805 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.22901463640480854, "micro_mean_score": 0.2250606411323753 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.41728623355613875, "micro_mean_score": 0.43452278589853827 }, "overall_score": 0.25324761425596987 }, "Phi-3.5-vision": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.23240864879023493, "micro_mean_score": 0.22932978620408923 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.2295097914016776, "micro_mean_score": 0.2266573336398296 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2428, "macro_mean_score": 0.3947914647737769, "micro_mean_score": 0.42459157351676696 }, "overall_score": 0.2533094072831661 }, "Pixtral_12B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3186510310643637, "micro_mean_score": 0.3151734861550665 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3132232487306254, "micro_mean_score": 0.30971424472967524 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.4566234428542061, "micro_mean_score": 0.4870593293207223 }, "overall_score": 0.3364098563442444 }, "Llama_3_2_11B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.10044261716549671, "micro_mean_score": 0.09980638766828835 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.15984490401619783, "micro_mean_score": 0.15794038158731832 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.3173342406187366, "micro_mean_score": 0.3487962166809973 }, "overall_score": 0.1801158087274157 }, "Idefics3": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.11118980301103833, "micro_mean_score": 0.11201785633274061 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.08956972487602757, "micro_mean_score": 0.08982225274252693 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.3210866162255635, "micro_mean_score": 0.35649183147033553 }, "overall_score": 0.138206224513898 } }