Update code to add the MLVU and TempCompass datasets
Browse files
- app.py +2 -2
- lb_info.py +42 -8
app.py
CHANGED
@@ -8,8 +8,8 @@ with gr.Blocks() as demo:
|
|
8 |
EVAL_TIME = format_timestamp(timestamp)
|
9 |
results = struct['results']
|
10 |
N_MODEL = len(results)
|
11 |
-
N_DATA = len(results['Video-LLaVA']) - 1
|
12 |
-
DATASETS = list(results['Video-LLaVA'])
|
13 |
DATASETS.remove('META')
|
14 |
print(DATASETS)
|
15 |
|
|
|
8 |
EVAL_TIME = format_timestamp(timestamp)
|
9 |
results = struct['results']
|
10 |
N_MODEL = len(results)
|
11 |
+
N_DATA = len(results['Video-LLaVA-7B']) - 1
|
12 |
+
DATASETS = list(results['Video-LLaVA-7B'])
|
13 |
DATASETS.remove('META')
|
14 |
print(DATASETS)
|
15 |
|
lb_info.py
CHANGED
@@ -36,7 +36,7 @@ This leaderboard was last updated: {}.
|
|
36 |
"""
|
37 |
# CONSTANTS-FIELDS
|
38 |
META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Frames']
|
39 |
-
MAIN_FIELDS = ['MVBench', 'Video-MME (w/o subs)', 'MMBench-Video']
|
40 |
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
41 |
MODEL_TYPE = ['API', 'OpenSource']
|
42 |
|
@@ -134,13 +134,41 @@ def BUILD_L1_DF(results, fields):
|
|
134 |
res[k].append(meta[k])
|
135 |
scores, ranks = [], []
|
136 |
for d in fields:
|
137 |
-
|
138 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
if d == 'MMBench-Video':
|
140 |
scores.append(item[d]['Overall'] / 3 * 100)
|
|
|
|
|
|
|
|
|
141 |
else:
|
142 |
scores.append(item[d]['Overall'])
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
res['Avg Score'].append(round(np.mean(scores), 1))
|
145 |
res['Avg Rank'].append(round(np.mean(ranks), 2))
|
146 |
|
@@ -160,13 +188,13 @@ def BUILD_L1_DF(results, fields):
|
|
160 |
def BUILD_L2_DF(results, dataset):
|
161 |
res = defaultdict(list)
|
162 |
fields = list(list(results.values())[0][dataset].keys())
|
163 |
-
non_overall_fields = [x for x in fields if 'Overall' not in x]
|
164 |
-
overall_fields = [x for x in fields if 'Overall' in x]
|
165 |
|
166 |
for m in results:
|
167 |
item = results[m]
|
168 |
meta = item['META']
|
169 |
-
if item[dataset] == {}:
|
170 |
continue
|
171 |
for k in META_FIELDS:
|
172 |
if k == 'Parameters (B)':
|
@@ -186,7 +214,12 @@ def BUILD_L2_DF(results, dataset):
|
|
186 |
res[d].append(item[dataset][d])
|
187 |
|
188 |
df = pd.DataFrame(res)
|
189 |
-
|
|
|
|
|
|
|
|
|
|
|
190 |
df = df.iloc[::-1]
|
191 |
|
192 |
check_box = {}
|
@@ -202,4 +235,5 @@ def BUILD_L2_DF(results, dataset):
|
|
202 |
type_map['Method'] = 'html'
|
203 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = type_map['Frames'] ='str'
|
204 |
check_box['type_map'] = type_map
|
|
|
205 |
return df, check_box
|
|
|
36 |
"""
|
37 |
# CONSTANTS-FIELDS
|
38 |
META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Frames']
|
39 |
+
MAIN_FIELDS = ['MVBench', 'Video-MME (w/o subs)', 'MMBench-Video', 'TempCompass', 'MLVU']
|
40 |
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
41 |
MODEL_TYPE = ['API', 'OpenSource']
|
42 |
|
|
|
134 |
res[k].append(meta[k])
|
135 |
scores, ranks = [], []
|
136 |
for d in fields:
|
137 |
+
# if d == 'MLVU':
|
138 |
+
# item[d]['Overall'] = item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16
|
139 |
+
# elif d == 'TempCompass':
|
140 |
+
# item[d]['Overall'] = item[d]['overall']
|
141 |
+
if d == 'MLVU':
|
142 |
+
res[d].append(
|
143 |
+
f'M-Avg: {item[d]["M-Avg"]}, G-Avg: {item[d]["G-Avg"]}'
|
144 |
+
# {
|
145 |
+
# 'M-Avg': item[d]['M-Avg'],
|
146 |
+
# 'G-Avg': item[d]['G-Avg']
|
147 |
+
# }
|
148 |
+
)
|
149 |
+
elif d == 'TempCompass':
|
150 |
+
res[d].append(item[d]['overall'])
|
151 |
+
else:
|
152 |
+
res[d].append(item[d]['Overall'])
|
153 |
+
|
154 |
if d == 'MMBench-Video':
|
155 |
scores.append(item[d]['Overall'] / 3 * 100)
|
156 |
+
elif d == 'TempCompass':
|
157 |
+
scores.append(item[d]['overall'])
|
158 |
+
elif d == 'MLVU':
|
159 |
+
scores.append(item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16)
|
160 |
else:
|
161 |
scores.append(item[d]['Overall'])
|
162 |
+
|
163 |
+
if d == 'MLVU':
|
164 |
+
ranks.append(nth_large(
|
165 |
+
item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16,
|
166 |
+
[x[d]['M-Avg'] * 0.84 + x[d]['G-Avg'] * 10 * 0.16 for x in results.values() if d in x and 'M-Avg' in x[d] and 'G-Avg' in x[d]]
|
167 |
+
))
|
168 |
+
elif d == 'TempCompass':
|
169 |
+
ranks.append(nth_large(item[d]['overall'], [x[d]['overall'] for x in results.values() if d in x and 'overall' in x[d]]))
|
170 |
+
else:
|
171 |
+
ranks.append(nth_large(item[d]['Overall'], [x[d]['Overall'] for x in results.values() if d in x and 'Overall' in x[d]]))
|
172 |
res['Avg Score'].append(round(np.mean(scores), 1))
|
173 |
res['Avg Rank'].append(round(np.mean(ranks), 2))
|
174 |
|
|
|
188 |
def BUILD_L2_DF(results, dataset):
|
189 |
res = defaultdict(list)
|
190 |
fields = list(list(results.values())[0][dataset].keys())
|
191 |
+
non_overall_fields = [x for x in fields if 'Overall' not in x and 'Avg' not in x and 'overall' not in x]
|
192 |
+
overall_fields = [x for x in fields if 'Overall' in x or 'Avg' in x or 'overall' in x]
|
193 |
|
194 |
for m in results:
|
195 |
item = results[m]
|
196 |
meta = item['META']
|
197 |
+
if dataset not in item or item[dataset] == {}:
|
198 |
continue
|
199 |
for k in META_FIELDS:
|
200 |
if k == 'Parameters (B)':
|
|
|
214 |
res[d].append(item[dataset][d])
|
215 |
|
216 |
df = pd.DataFrame(res)
|
217 |
+
if dataset == 'MLVU':
|
218 |
+
df = df.sort_values('M-Avg')
|
219 |
+
elif dataset == 'TempCompass':
|
220 |
+
df = df.sort_values('overall')
|
221 |
+
else:
|
222 |
+
df = df.sort_values('Overall')
|
223 |
df = df.iloc[::-1]
|
224 |
|
225 |
check_box = {}
|
|
|
235 |
type_map['Method'] = 'html'
|
236 |
type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = type_map['Frames'] ='str'
|
237 |
check_box['type_map'] = type_map
|
238 |
+
# print(check_box, dataset, df.columns)
|
239 |
return df, check_box
|