Commit f65a20c by kennymckormick (committed)
Parent(s): edf09c4
update meta_data

Files changed: meta_data.py (+22, -6)
meta_data.py
CHANGED
@@ -21,8 +21,8 @@ This leaderboard was last updated: {}.
 META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
 MAIN_FIELDS = [
     'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
-    'MMMU_VAL', 'MathVista', 'HallusionBench', 'AI2D',
-    '
+    'MMMU_VAL', 'MathVista', 'HallusionBench', 'AI2D',
+    'OCRBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
 ]
 MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
 MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
@@ -31,12 +31,21 @@ MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
 # The README file for each benchmark
 LEADERBOARD_MD = {}
 
-LEADERBOARD_MD['MAIN'] = """
+LEADERBOARD_MD['MAIN'] = f"""
 ## Main Evaluation Results
 
--
-- Avg
-- The
+- Metrics:
+  - Avg Score: the average score on all VLM benchmarks (normalized to 0 - 100, the higher the better).
+  - Avg Rank: the average rank on all VLM benchmarks (the lower the better).
+- The overall evaluation results on {len(MAIN_FIELDS)} VLM benchmarks, sorted in ascending order of Avg Rank.
+- The following datasets are included in the main results: {' '.join(MAIN_FIELDS)}. The detailed evaluation results for each dataset are provided in the subsequent tabs.
+"""
+
+for dataset in ['MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'CCBench']:
+    LEADERBOARD_MD[dataset] = f"""
+## {dataset.replace('_', ' ')} Evaluation Results
+
+- We adopt Circular Eval for benchmarks in the MMBench series; see https://arxiv.org/pdf/2307.06281.pdf for the detailed definition of Circular Eval.
 """
 
 LEADERBOARD_MD['SEEDBench_IMG'] = """
@@ -134,4 +143,11 @@ LEADERBOARD_MD['OCRBench'] = """
 
 - The evaluation of OCRBench is implemented by the official team: https://github.com/Yuliang-Liu/MultimodalOCR.
 - The performance of GPT4V might be underestimated: GPT4V refuses to answer 12 percent of the questions due to the policy of OpenAI. For those questions, the returned answer is "Your input image may contain content that is not allowed by our safety system."
+"""
+
+LEADERBOARD_MD['MMStar'] = """
+## MMStar Evaluation Results
+
+- MMStar is an elite vision-indispensable multi-modal benchmark, including 1,500 challenging samples meticulously selected by humans.
+- During the evaluation of MMStar, we find that some API models may refuse to answer some of the questions. Currently, we treat such cases as wrong answers when reporting the results.
 """