Teo Wu committed on
Commit
3852af6
1 Parent(s): 78810bb
Files changed (2) hide show
  1. app.py +33 -8
  2. result.csv +23 -23
app.py CHANGED
@@ -3,13 +3,28 @@ import pandas as pd
3
 
4
  block = gr.Blocks(title="LongVideoBench Leaderboard", theme='gradio/soft')
5
 
6
- def sort_data(key):
7
- data = pd.read_csv("result.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  if key in data.columns:
9
  df_sorted = data.sort_values(by=key, ascending=False)
10
  else:
11
  df_sorted = data.sort_values(by='Test Total', ascending=False)
12
- return df_sorted
 
13
 
14
  with block:
15
 
@@ -24,13 +39,23 @@ with block:
24
  """)
25
 
26
  with gr.Tab("Existing Results"):
27
- key_input = gr.Textbox(label="Sort by subset (default: Test Total)", placeholder="Enter column name to sort by")
28
- data_frame = gr.DataFrame(sort_data('Test Total'))
 
 
 
 
29
 
30
- def update_data_frame(key):
31
- return sort_data(key)
32
 
33
- key_input.change(update_data_frame, inputs=key_input, outputs=data_frame)
 
 
 
 
 
 
 
34
 
35
  with gr.Tab("Submit!"):
36
  gr.Markdown(
 
3
 
4
  block = gr.Blocks(title="LongVideoBench Leaderboard", theme='gradio/soft')
5
 
6
# Function to sort data and filter columns based on checkboxes
def sort_data(key, show_duration, show_category):
    """Load the leaderboard, rank it by one column and pick the columns to show.

    The table is re-read from ``result.csv`` (relative to the current working
    directory) on every call.

    Args:
        key: Name of the column to rank by, as typed by the user. Surrounding
            whitespace and letter case are ignored when matching it against
            the CSV header. Anything that does not resolve to a column
            (empty string, ``None``, unknown name) falls back to
            ``'Test Total'``.
        show_duration: When truthy, include the per-duration-group columns.
        show_category: When truthy, include the per-question-category columns.

    Returns:
        A ``pandas.DataFrame`` sorted in descending order by the resolved
        column and restricted to the selected display columns. The ranking
        column itself is only displayed if it is among the selected columns.

    Raises:
        KeyError: If ``result.csv`` lacks one of the selected display columns
            or the column used for ranking.
    """
    data = pd.read_csv("result.csv")

    duration_columns = ['8s-15s', '15s-60s', '180s-600s', '900s-3600s']
    # NOTE(review): the result.csv header also lists O2E, T2E, T2O, T2A, E3E
    # and O3O, which are not included here - confirm the omission is intended.
    category_columns = ['S2E', 'S2O', 'S2A', 'E2O', 'SSS', 'SOS', 'SAA', 'T3E', 'T3O', 'TOS', 'TAA']

    columns_to_show = ['Model', 'Test Total']

    if show_duration:
        columns_to_show += duration_columns
    if show_category:
        columns_to_show += category_columns

    columns_to_show += ['Val Total', 'LMM Type', 'Interleaved?', "#Max Frames"]

    # Resolve the requested column. An exact match wins; otherwise compare
    # with whitespace stripped and case folded so that free-form textbox
    # input such as " val total " still selects 'Val Total' instead of
    # silently falling back to the default ranking.
    sort_column = 'Test Total'
    if isinstance(key, str):
        if key in data.columns:
            sort_column = key
        else:
            wanted = key.strip().casefold()
            for column in data.columns:
                if str(column).casefold() == wanted:
                    sort_column = column
                    break

    df_sorted = data.sort_values(by=sort_column, ascending=False)

    return df_sorted[columns_to_show]
28
 
29
  with block:
30
 
 
39
  """)
40
 
41
  with gr.Tab("Existing Results"):
42
+ with gr.Row():
43
+ show_duration = gr.Checkbox(label="Show Test Set Accuracy by Duration Groups", value=False)
44
+ show_category = gr.Checkbox(label="Show Test Set Accuracy by Question Categories", value=False)
45
+
46
+ key_input = gr.Textbox(label="Rank LMMs by column:", placeholder="Test Total (default)")
47
+
48
 
49
+ data_frame = gr.DataFrame(sort_data('Test Total', show_duration=False, show_category=False))
 
50
 
51
+ def update_data_frame(key, show_duration, show_category):
52
+ return sort_data(key, show_duration, show_category)
53
+
54
+ key_input.change(update_data_frame, inputs=[key_input, show_duration, show_category], outputs=data_frame)
55
+ show_duration.change(update_data_frame, inputs=[key_input, show_duration, show_category], outputs=data_frame)
56
+ show_category.change(update_data_frame, inputs=[key_input, show_duration, show_category], outputs=data_frame)
57
+
58
+ gr.Markdown("Models are evaluated using their optimal #max frames, capped at 256 frames.")
59
 
60
  with gr.Tab("Submit!"):
61
  gr.Markdown(
result.csv CHANGED
@@ -1,23 +1,23 @@
1
- Model,Val Total,8s-15s,15s-60s,180s-600s,900s-3600s,S2E,S2O,S2A,E2O,O2E,T2E,T2O,T2A,E3E,O3O,SSS,SOS,SAA,T3E,T3O,TOS,TAA,Test Total,LMM Type,Interleaved?
2
- GPT-4o (0513),66.7,71.6,76.8,66.7,61.6,76.8,69.8,70.9,67.3,72.8,67.2,65.3,77.2,62.6,61.3,44.3,75.6,62.6,64.0,66.4,62.1,66.4,66.7,Proprietary long-context,Yes
3
- Gemini-1.5-Pro (0514),64.0,70.2,75.3,65.0,59.1,74.6,58.3,76.2,68.7,73.3,66.2,63.6,76.7,61.9,58.6,55.2,69.0,59.0,58.9,60.5,53.3,62.5,64.4,Proprietary long-context,Yes
4
- Gemini-1.5-Flash (0514),61.6,66.1,73.1,63.1,57.3,68.5,64.7,68.0,64.5,72.5,63.6,68.0,76.7,56.5,61.0,43.1,67.3,56.2,57.5,55.0,55.3,60.7,62.4,Proprietary long-context,Yes
5
- GPT-4-Turbo (0409),59.1,66.4,71.1,61.7,54.5,74.9,60.1,64.2,63.9,69.4,62.5,61.3,69.9,57.5,55.9,44.8,66.0,53.2,56.5,53.6,56.2,60.2,60.7,Proprietary long-context,Yes
6
- Idefics2,49.7,57.4,60.4,47.3,44.7,60.9,51.4,49.4,53.7,58.9,54.4,51.8,54.8,46.8,40.5,28.9,61.0,49.8,47.0,42.0,40.7,46.2,49.4,Open-source long-context,Yes
7
- Phi-3-Vision-Instruct,49.6,58.3,59.6,48.4,45.1,60.3,52.9,53.4,51.8,54.1,52.3,55.3,53.3,49.4,47.6,33.6,59.3,46.2,44.2,43.2,38.8,51.5,49.9,Open-source long-context,Yes
8
- Mantis-Idefics2,47.0,56.1,61.4,44.6,42.5,60.3,51.1,51.2,53.4,52.9,51.4,49.5,57.3,46.2,45.1,30.2,53.7,46.5,44.2,40.1,30.6,40.2,47.6,Open-source long-context,Yes
9
- Mantis-BakLLaVA,43.7,51.3,52.7,41.1,40.1,53.0,38.7,44.1,46.0,51.0,50.8,43.7,50.8,45.5,40.2,23.3,48.0,44.9,40.9,38.5,34.9,47.7,43.7,Open-source long-context,Yes
10
- LLaVA-Next-Mistral-7B,49.1,53.4,57.2,46.9,42.1,59.0,46.5,49.4,49.7,52.2,52.9,51.1,51.4,47.4,45.4,28.2,56.0,50.8,38.7,41.6,31.9,48.1,47.1,Image,No
11
- InstructBLIP-T5-XXL,43.3,48.1,50.1,44.5,40.0,54.9,39.3,41.3,45.4,49.7,52.9,42.4,48.6,44.2,40.2,25.2,51.0,42.9,42.7,41.6,33.9,47.7,43.8,Image,No
12
- BLIP-2-T5-XXL,42.7,46.7,47.4,44.2,40.9,54.6,38.1,38.8,46.3,49.0,52.6,40.2,44.3,45.2,41.2,25.6,51.3,41.6,45.1,45.1,33.6,47.4,43.5,Image,No
13
- LLaVA-1.5-13B,43.4,49.0,51.1,41.8,39.6,54.9,42.6,40.4,44.8,49.0,51.1,43.1,43.0,45.2,40.9,29.9,53.3,44.2,38.7,35.6,30.0,46.2,43.1,Image,No
14
- LLaVA-1.5-7B,40.3,45.0,47.4,40.1,37.0,53.3,35.0,38.8,39.6,44.9,44.1,39.9,43.3,40.7,43.9,26.2,47.3,42.9,37.2,34.7,30.3,45.1,40.4,Image,No
15
- mPLUG-Owl2,39.1,49.4,47.3,38.7,34.3,49.5,37.5,37.3,39.6,45.5,45.9,41.5,39.6,44.6,36.9,24.9,45.7,38.9,30.9,36.6,33.9,38.3,39.4,Image,No
16
- PLLaVA-34B,53.2,60.1,66.8,50.8,49.1,65.9,53.8,53.1,54.9,57.6,58.9,52.4,56.3,54.8,50.6,44.2,60.3,56.1,46.6,47.9,41.4,54.9,53.5,Video,No
17
- PLLaVA-13B,45.6,52.9,54.3,42.9,41.2,57.1,43.5,41.9,47.3,53.5,54.4,46.9,43.7,47.1,43.6,27.2,58.0,44.2,39.6,40.1,30.9,47.0,45.1,Video,No
18
- LLaVA-Next-Video-M7B,43.5,50.9,53.1,42.6,38.9,54.6,41.7,47.2,46.3,52.9,46.8,46.6,45.8,44.9,42.1,24.6,51.3,40.6,39.0,40.1,34.5,39.5,43.5,Video,No
19
- ShareGPT4Video,39.7,46.9,50.1,40.0,38.7,50.2,37.5,44.4,44.2,42.7,43.8,41.2,45.8,41.7,42.7,29.9,50.3,47.2,38.7,39.7,29.3,39.8,41.8,Video,No
20
- PLLaVA-7B,40.2,45.3,47.3,38.5,35.2,52.4,35.3,40.4,39.3,46.8,46.5,39.9,39.3,41.0,36.3,26.2,47.7,41.6,34.1,30.5,27.7,38.3,39.2,Video,No
21
- VideoChat2 (Mistral-7B),39.3,49.3,49.3,39.0,37.5,53.6,40.8,38.5,44.5,53.5,46.8,43.1,47.7,43.6,46.6,10.6,42.0,40.6,38.4,36.3,27.4,43.6,41.2,Video,No
22
- VideoLLaVA,39.1,43.1,44.6,36.4,34.4,49.5,29.6,30.6,40.9,44.9,43.5,33.8,40.6,46.5,38.7,24.3,40.0,42.9,35.1,30.5,23.8,39.5,37.6,Video,No
23
- VideoChat2 (Vicuna 7B),36.0,38.1,40.5,33.5,33.6,44.8,29.0,27.3,36.9,41.7,41.7,34.1,33.1,37.2,39.6,22.6,43.0,30.7,34.1,33.8,28.3,37.2,35.1,Video,No
 
1
+ Model,Test Total,8s-15s,15s-60s,180s-600s,900s-3600s,S2E,S2O,S2A,E2O,O2E,T2E,T2O,T2A,E3E,O3O,SSS,SOS,SAA,T3E,T3O,TOS,TAA,Val Total,LMM Type,Interleaved?,#Max Frames
2
+ GPT-4o (0513),66.7,71.6,76.8,66.7,61.6,76.8,69.8,70.9,67.3,72.8,67.2,65.3,77.2,62.6,61.3,44.3,75.6,62.6,64.0,66.4,62.1,66.4,66.7,Proprietary long-context,Yes,256
3
+ Gemini-1.5-Pro (0514),64.4,70.2,75.3,65.0,59.1,74.6,58.3,76.2,68.7,73.3,66.2,63.6,76.7,61.9,58.6,55.2,69.0,59.0,58.9,60.5,53.3,62.5,64.0,Proprietary long-context,Yes,256
4
+ Gemini-1.5-Flash (0514),62.4,66.1,73.1,63.1,57.3,68.5,64.7,68.0,64.5,72.5,63.6,68.0,76.7,56.5,61.0,43.1,67.3,56.2,57.5,55.0,55.3,60.7,61.6,Proprietary long-context,Yes,256
5
+ GPT-4-Turbo (0409),60.7,66.4,71.1,61.7,54.5,74.9,60.1,64.2,63.9,69.4,62.5,61.3,69.9,57.5,55.9,44.8,66.0,53.2,56.5,53.6,56.2,60.2,59.1,Proprietary long-context,Yes,256
6
+ Idefics2,49.4,57.4,60.4,47.3,44.7,60.9,51.4,49.4,53.7,58.9,54.4,51.8,54.8,46.8,40.5,28.9,61.0,49.8,47.0,42.0,40.7,46.2,49.7,Open-source long-context,Yes,16
7
+ Phi-3-Vision-Instruct,49.9,58.3,59.6,48.4,45.1,60.3,52.9,53.4,51.8,54.1,52.3,55.3,53.3,49.4,47.6,33.6,59.3,46.2,44.2,43.2,38.8,51.5,49.6,Open-source long-context,Yes,16
8
+ Mantis-Idefics2,47.6,56.1,61.4,44.6,42.5,60.3,51.1,51.2,53.4,52.9,51.4,49.5,57.3,46.2,45.1,30.2,53.7,46.5,44.2,40.1,30.6,40.2,47.0,Open-source long-context,Yes,16
9
+ Mantis-BakLLaVA,43.7,51.3,52.7,41.1,40.1,53.0,38.7,44.1,46.0,51.0,50.8,43.7,50.8,45.5,40.2,23.3,48.0,44.9,40.9,38.5,34.9,47.7,43.7,Open-source long-context,Yes,16
10
+ LLaVA-Next-Mistral-7B,47.1,53.4,57.2,46.9,42.1,59.0,46.5,49.4,49.7,52.2,52.9,51.1,51.4,47.4,45.4,28.2,56.0,50.8,38.7,41.6,31.9,48.1,49.1,Image,No,8
11
+ InstructBLIP-T5-XXL,43.8,48.1,50.1,44.5,40.0,54.9,39.3,41.3,45.4,49.7,52.9,42.4,48.6,44.2,40.2,25.2,51.0,42.9,42.7,41.6,33.9,47.7,43.3,Image,No,8
12
+ BLIP-2-T5-XXL,43.5,46.7,47.4,44.2,40.9,54.6,38.1,38.8,46.3,49.0,52.6,40.2,44.3,45.2,41.2,25.6,51.3,41.6,45.1,45.1,33.6,47.4,42.7,Image,No,8
13
+ LLaVA-1.5-13B,43.1,49.0,51.1,41.8,39.6,54.9,42.6,40.4,44.8,49.0,51.1,43.1,43.0,45.2,40.9,29.9,53.3,44.2,38.7,35.6,30.0,46.2,43.4,Image,No,8
14
+ LLaVA-1.5-7B,40.4,45.0,47.4,40.1,37.0,53.3,35.0,38.8,39.6,44.9,44.1,39.9,43.3,40.7,43.9,26.2,47.3,42.9,37.2,34.7,30.3,45.1,40.3,Image,No,8
15
+ mPLUG-Owl2,39.4,49.4,47.3,38.7,34.3,49.5,37.5,37.3,39.6,45.5,45.9,41.5,39.6,44.6,36.9,24.9,45.7,38.9,30.9,36.6,33.9,38.3,39.1,Image,No,8
16
+ PLLaVA-34B,53.5,60.1,66.8,50.8,49.1,65.9,53.8,53.1,54.9,57.6,58.9,52.4,56.3,54.8,50.6,44.2,60.3,56.1,46.6,47.9,41.4,54.9,53.2,Video,No,16
17
+ PLLaVA-13B,45.1,52.9,54.3,42.9,41.2,57.1,43.5,41.9,47.3,53.5,54.4,46.9,43.7,47.1,43.6,27.2,58.0,44.2,39.6,40.1,30.9,47.0,45.6,Video,No,16
18
+ LLaVA-Next-Video-M7B,43.5,50.9,53.1,42.6,38.9,54.6,41.7,47.2,46.3,52.9,46.8,46.6,45.8,44.9,42.1,24.6,51.3,40.6,39.0,40.1,34.5,39.5,43.5,Video,No,32
19
+ ShareGPT4Video,41.8,46.9,50.1,40.0,38.7,50.2,37.5,44.4,44.2,42.7,43.8,41.2,45.8,41.7,42.7,29.9,50.3,47.2,38.7,39.7,29.3,39.8,39.7,Video,No,16
20
+ PLLaVA-7B,39.2,45.3,47.3,38.5,35.2,52.4,35.3,40.4,39.3,46.8,46.5,39.9,39.3,41.0,36.3,26.2,47.7,41.6,34.1,30.5,27.7,38.3,40.2,Video,No,16
21
+ VideoChat2 (Mistral-7B),41.2,49.3,49.3,39.0,37.5,53.6,40.8,38.5,44.5,53.5,46.8,43.1,47.7,43.6,46.6,10.6,42.0,40.6,38.4,36.3,27.4,43.6,39.3,Video,No,16
22
+ VideoLLaVA,37.6,43.1,44.6,36.4,34.4,49.5,29.6,30.6,40.9,44.9,43.5,33.8,40.6,46.5,38.7,24.3,40.0,42.9,35.1,30.5,23.8,39.5,39.1,Video,No,8
23
+ VideoChat2 (Vicuna 7B),35.1,38.1,40.5,33.5,33.6,44.8,29.0,27.3,36.9,41.7,41.7,34.1,33.1,37.2,39.6,22.6,43.0,30.7,34.1,33.8,28.3,37.2,36.0,Video,No,16