kennymckormick committed • 044f86f
Parent(s): 746f6aa
update lb_info.py

lb_info.py CHANGED (+18 -1)
@@ -32,7 +32,7 @@ This leaderboard was last updated: {}.
 """
 # CONSTANTS-FIELDS
 META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
-MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']
+MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench', 'AI2D_TEST']
 MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
 MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
@@ -122,6 +122,23 @@ LEADERBOARD_MD['LLaVABench'] = """
 - We also include the official results (obtained by gpt-4-0314) for applicable models.
 """

+LEADERBOARD_MD['COCO_VAL'] = """
+## COCO Caption Results
+
+- By default, we evaluate the COCO Caption validation set (5,000 samples) and report the following metrics: `BLEU-1, BLEU-4, CIDEr, ROUGE-L`.
+- We use the following prompt to evaluate all VLMs: `Please describe this image in general. Directly provide the description, do not include prefix like "This image depicts". `
+- **No model-specific prompt is adopted; all VLMs receive the same prompt.**
+"""
+
+LEADERBOARD_MD['ScienceQA_VAL'] = """
+# ScienceQA Evaluation Results
+
+- We benchmark the **image** subset of the ScienceQA validation and test sets, and report the Top-1 accuracy.
+- During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs when the choice cannot be extracted via heuristic matching. **Zero-shot** inference is adopted.
+"""
+
+LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
+
 from urllib.request import urlopen

 def load_results():
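For context, the ScienceQA note added above describes a two-stage answer-extraction scheme: try to recover the option letter heuristically, and only fall back to `GPT-3.5-Turbo-0613` when that fails. Below is a minimal sketch of that fallback logic, for illustration only; it is not the actual VLMEvalKit implementation, and `llm_extract_choice` is a hypothetical stand-in for the GPT-based extractor.

```python
import re

def heuristic_choice(response, choices):
    """Try to recover an option letter (e.g. 'A'-'D') from a free-form VLM answer."""
    # Case 1: the answer is essentially just the letter ("B", "(C)", "D.").
    m = re.match(r"^\(?([A-D])\)?[.:,]?\s*$", response.strip(), re.IGNORECASE)
    if m:
        return m.group(1).upper()
    # Case 2: exactly one option's text appears verbatim in the answer.
    hits = [letter for letter, text in choices.items() if text.lower() in response.lower()]
    return hits[0] if len(hits) == 1 else None

def extract_choice(response, choices, llm_extract_choice):
    """Heuristic matching first; fall back to an LLM-based extractor when ambiguous."""
    return heuristic_choice(response, choices) or llm_extract_choice(response, choices)
```

Under this sketch, a bare reply like `"(C)"` or one that quotes exactly one option resolves heuristically, while a longer free-form answer such as `"The correct option is B because..."` falls through to the LLM extractor.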
|