Spaces:
Running
Running
Teo Wu
committed on
Commit
•
3852af6
1
Parent(s):
78810bb
update
Browse files- app.py +33 -8
- result.csv +23 -23
app.py
CHANGED
@@ -3,13 +3,28 @@ import pandas as pd
|
|
3 |
|
4 |
block = gr.Blocks(title="LongVideoBench Leaderboard", theme='gradio/soft')
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
if key in data.columns:
|
9 |
df_sorted = data.sort_values(by=key, ascending=False)
|
10 |
else:
|
11 |
df_sorted = data.sort_values(by='Test Total', ascending=False)
|
12 |
-
|
|
|
13 |
|
14 |
with block:
|
15 |
|
@@ -24,13 +39,23 @@ with block:
|
|
24 |
""")
|
25 |
|
26 |
with gr.Tab("Existing Results"):
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
|
31 |
-
return sort_data(key)
|
32 |
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
with gr.Tab("Submit!"):
|
36 |
gr.Markdown(
|
|
|
3 |
|
4 |
block = gr.Blocks(title="LongVideoBench Leaderboard", theme='gradio/soft')
|
5 |
|
6 |
+
# Function to sort data and filter columns based on checkboxes
def sort_data(key, show_duration, show_category):
    """Load ``result.csv``, rank the rows and return the selected columns.

    Args:
        key: Name of the column to rank by (descending). Surrounding
            whitespace is ignored. If it is not a column of the CSV
            (including empty or ``None``), ranking falls back to
            ``'Test Total'``.
        show_duration: When truthy, include the per-duration-group columns.
        show_category: When truthy, include the per-question-category columns.

    Returns:
        A DataFrame sorted in descending order by the chosen column and
        restricted to ``Model``, ``Test Total``, the optional groups, and the
        trailing metadata columns, in that order.

    Raises:
        KeyError: If ``result.csv`` lacks one of the columns to be shown.
    """
    data = pd.read_csv("result.csv")

    duration_columns = ['8s-15s', '15s-60s', '180s-600s', '900s-3600s']
    # All 17 question categories, in the same order as the result.csv header.
    category_columns = [
        'S2E', 'S2O', 'S2A', 'E2O', 'O2E', 'T2E', 'T2O', 'T2A', 'E3E', 'O3O',
        'SSS', 'SOS', 'SAA', 'T3E', 'T3O', 'TOS', 'TAA',
    ]

    columns_to_show = ['Model', 'Test Total']
    if show_duration:
        columns_to_show += duration_columns
    if show_category:
        columns_to_show += category_columns
    columns_to_show += ['Val Total', 'LMM Type', 'Interleaved?', "#Max Frames"]

    # The key comes from a free-text box, so tolerate stray whitespace.
    if isinstance(key, str):
        key = key.strip()

    # Sorting may use a column that is not displayed; unknown keys fall back
    # to the default ranking column.
    sort_column = key if key in data.columns else 'Test Total'
    df_sorted = data.sort_values(by=sort_column, ascending=False)

    return df_sorted[columns_to_show]
|
28 |
|
29 |
with block:
|
30 |
|
|
|
39 |
""")
|
40 |
|
41 |
with gr.Tab("Existing Results"):
|
42 |
+
with gr.Row():
|
43 |
+
show_duration = gr.Checkbox(label="Show Test Set Accuracy by Duration Groups", value=False)
|
44 |
+
show_category = gr.Checkbox(label="Show Test Set Accuracy by Question Categories", value=False)
|
45 |
+
|
46 |
+
key_input = gr.Textbox(label="Rank LMMs by column:", placeholder="Test Total (default)")
|
47 |
+
|
48 |
|
49 |
+
data_frame = gr.DataFrame(sort_data('Test Total', show_duration=False, show_category=False))
|
|
|
50 |
|
51 |
+
def update_data_frame(key, show_duration, show_category):
|
52 |
+
return sort_data(key, show_duration, show_category)
|
53 |
+
|
54 |
+
key_input.change(update_data_frame, inputs=[key_input, show_duration, show_category], outputs=data_frame)
|
55 |
+
show_duration.change(update_data_frame, inputs=[key_input, show_duration, show_category], outputs=data_frame)
|
56 |
+
show_category.change(update_data_frame, inputs=[key_input, show_duration, show_category], outputs=data_frame)
|
57 |
+
|
58 |
+
gr.Markdown("Models are evaluated using their optimal #max frames, capped at 256 frames.")
|
59 |
|
60 |
with gr.Tab("Submit!"):
|
61 |
gr.Markdown(
|
result.csv
CHANGED
@@ -1,23 +1,23 @@
|
|
1 |
-
Model,
|
2 |
-
GPT-4o (0513),66.7,71.6,76.8,66.7,61.6,76.8,69.8,70.9,67.3,72.8,67.2,65.3,77.2,62.6,61.3,44.3,75.6,62.6,64.0,66.4,62.1,66.4,66.7,Proprietary long-context,Yes
|
3 |
-
Gemini-1.5-Pro (0514),64.
|
4 |
-
Gemini-1.5-Flash (0514),
|
5 |
-
GPT-4-Turbo (0409),
|
6 |
-
Idefics2,49.
|
7 |
-
Phi-3-Vision-Instruct,49.
|
8 |
-
Mantis-Idefics2,47.
|
9 |
-
Mantis-BakLLaVA,43.7,51.3,52.7,41.1,40.1,53.0,38.7,44.1,46.0,51.0,50.8,43.7,50.8,45.5,40.2,23.3,48.0,44.9,40.9,38.5,34.9,47.7,43.7,Open-source long-context,Yes
|
10 |
-
LLaVA-Next-Mistral-7B,
|
11 |
-
InstructBLIP-T5-XXL,43.
|
12 |
-
BLIP-2-T5-XXL,
|
13 |
-
LLaVA-1.5-13B,43.
|
14 |
-
LLaVA-1.5-7B,40.
|
15 |
-
mPLUG-Owl2,39.
|
16 |
-
PLLaVA-34B,53.
|
17 |
-
PLLaVA-13B,45.
|
18 |
-
LLaVA-Next-Video-M7B,43.5,50.9,53.1,42.6,38.9,54.6,41.7,47.2,46.3,52.9,46.8,46.6,45.8,44.9,42.1,24.6,51.3,40.6,39.0,40.1,34.5,39.5,43.5,Video,No
|
19 |
-
ShareGPT4Video,
|
20 |
-
PLLaVA-7B,
|
21 |
-
VideoChat2 (Mistral-7B),
|
22 |
-
VideoLLaVA,
|
23 |
-
VideoChat2 (Vicuna 7B),
|
|
|
1 |
+
Model,Test Total,8s-15s,15s-60s,180s-600s,900s-3600s,S2E,S2O,S2A,E2O,O2E,T2E,T2O,T2A,E3E,O3O,SSS,SOS,SAA,T3E,T3O,TOS,TAA,Val Total,LMM Type,Interleaved?,#Max Frames
|
2 |
+
GPT-4o (0513),66.7,71.6,76.8,66.7,61.6,76.8,69.8,70.9,67.3,72.8,67.2,65.3,77.2,62.6,61.3,44.3,75.6,62.6,64.0,66.4,62.1,66.4,66.7,Proprietary long-context,Yes,256
|
3 |
+
Gemini-1.5-Pro (0514),64.4,70.2,75.3,65.0,59.1,74.6,58.3,76.2,68.7,73.3,66.2,63.6,76.7,61.9,58.6,55.2,69.0,59.0,58.9,60.5,53.3,62.5,64.0,Proprietary long-context,Yes,256
|
4 |
+
Gemini-1.5-Flash (0514),62.4,66.1,73.1,63.1,57.3,68.5,64.7,68.0,64.5,72.5,63.6,68.0,76.7,56.5,61.0,43.1,67.3,56.2,57.5,55.0,55.3,60.7,61.6,Proprietary long-context,Yes,256
|
5 |
+
GPT-4-Turbo (0409),60.7,66.4,71.1,61.7,54.5,74.9,60.1,64.2,63.9,69.4,62.5,61.3,69.9,57.5,55.9,44.8,66.0,53.2,56.5,53.6,56.2,60.2,59.1,Proprietary long-context,Yes,256
|
6 |
+
Idefics2,49.4,57.4,60.4,47.3,44.7,60.9,51.4,49.4,53.7,58.9,54.4,51.8,54.8,46.8,40.5,28.9,61.0,49.8,47.0,42.0,40.7,46.2,49.7,Open-source long-context,Yes,16
|
7 |
+
Phi-3-Vision-Instruct,49.9,58.3,59.6,48.4,45.1,60.3,52.9,53.4,51.8,54.1,52.3,55.3,53.3,49.4,47.6,33.6,59.3,46.2,44.2,43.2,38.8,51.5,49.6,Open-source long-context,Yes,16
|
8 |
+
Mantis-Idefics2,47.6,56.1,61.4,44.6,42.5,60.3,51.1,51.2,53.4,52.9,51.4,49.5,57.3,46.2,45.1,30.2,53.7,46.5,44.2,40.1,30.6,40.2,47.0,Open-source long-context,Yes,16
|
9 |
+
Mantis-BakLLaVA,43.7,51.3,52.7,41.1,40.1,53.0,38.7,44.1,46.0,51.0,50.8,43.7,50.8,45.5,40.2,23.3,48.0,44.9,40.9,38.5,34.9,47.7,43.7,Open-source long-context,Yes,16
|
10 |
+
LLaVA-Next-Mistral-7B,47.1,53.4,57.2,46.9,42.1,59.0,46.5,49.4,49.7,52.2,52.9,51.1,51.4,47.4,45.4,28.2,56.0,50.8,38.7,41.6,31.9,48.1,49.1,Image,No,8
|
11 |
+
InstructBLIP-T5-XXL,43.8,48.1,50.1,44.5,40.0,54.9,39.3,41.3,45.4,49.7,52.9,42.4,48.6,44.2,40.2,25.2,51.0,42.9,42.7,41.6,33.9,47.7,43.3,Image,No,8
|
12 |
+
BLIP-2-T5-XXL,43.5,46.7,47.4,44.2,40.9,54.6,38.1,38.8,46.3,49.0,52.6,40.2,44.3,45.2,41.2,25.6,51.3,41.6,45.1,45.1,33.6,47.4,42.7,Image,No,8
|
13 |
+
LLaVA-1.5-13B,43.1,49.0,51.1,41.8,39.6,54.9,42.6,40.4,44.8,49.0,51.1,43.1,43.0,45.2,40.9,29.9,53.3,44.2,38.7,35.6,30.0,46.2,43.4,Image,No,8
|
14 |
+
LLaVA-1.5-7B,40.4,45.0,47.4,40.1,37.0,53.3,35.0,38.8,39.6,44.9,44.1,39.9,43.3,40.7,43.9,26.2,47.3,42.9,37.2,34.7,30.3,45.1,40.3,Image,No,8
|
15 |
+
mPLUG-Owl2,39.4,49.4,47.3,38.7,34.3,49.5,37.5,37.3,39.6,45.5,45.9,41.5,39.6,44.6,36.9,24.9,45.7,38.9,30.9,36.6,33.9,38.3,39.1,Image,No,8
|
16 |
+
PLLaVA-34B,53.5,60.1,66.8,50.8,49.1,65.9,53.8,53.1,54.9,57.6,58.9,52.4,56.3,54.8,50.6,44.2,60.3,56.1,46.6,47.9,41.4,54.9,53.2,Video,No,16
|
17 |
+
PLLaVA-13B,45.1,52.9,54.3,42.9,41.2,57.1,43.5,41.9,47.3,53.5,54.4,46.9,43.7,47.1,43.6,27.2,58.0,44.2,39.6,40.1,30.9,47.0,45.6,Video,No,16
|
18 |
+
LLaVA-Next-Video-M7B,43.5,50.9,53.1,42.6,38.9,54.6,41.7,47.2,46.3,52.9,46.8,46.6,45.8,44.9,42.1,24.6,51.3,40.6,39.0,40.1,34.5,39.5,43.5,Video,No,32
|
19 |
+
ShareGPT4Video,41.8,46.9,50.1,40.0,38.7,50.2,37.5,44.4,44.2,42.7,43.8,41.2,45.8,41.7,42.7,29.9,50.3,47.2,38.7,39.7,29.3,39.8,39.7,Video,No,16
|
20 |
+
PLLaVA-7B,39.2,45.3,47.3,38.5,35.2,52.4,35.3,40.4,39.3,46.8,46.5,39.9,39.3,41.0,36.3,26.2,47.7,41.6,34.1,30.5,27.7,38.3,40.2,Video,No,16
|
21 |
+
VideoChat2 (Mistral-7B),41.2,49.3,49.3,39.0,37.5,53.6,40.8,38.5,44.5,53.5,46.8,43.1,47.7,43.6,46.6,10.6,42.0,40.6,38.4,36.3,27.4,43.6,39.3,Video,No,16
|
22 |
+
VideoLLaVA,37.6,43.1,44.6,36.4,34.4,49.5,29.6,30.6,40.9,44.9,43.5,33.8,40.6,46.5,38.7,24.3,40.0,42.9,35.1,30.5,23.8,39.5,39.1,Video,No,8
|
23 |
+
VideoChat2 (Vicuna 7B),35.1,38.1,40.5,33.5,33.6,44.8,29.0,27.3,36.9,41.7,41.7,34.1,33.1,37.2,39.6,22.6,43.0,30.7,34.1,33.8,28.3,37.2,36.0,Video,No,16
|