lyx97 committed on
Commit
5d57406
1 Parent(s): b23950f
app.py CHANGED
@@ -1,334 +1,233 @@
1
- import subprocess
 
2
  import gradio as gr
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- NUMERIC_INTERVALS,
22
- TYPES,
23
- AutoEvalColumn,
24
- ModelType,
25
- fields,
26
- WeightType,
27
- Precision
28
- )
29
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
30
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
31
- from src.submission.submit import add_new_eval
32
-
33
-
34
- def restart_space():
35
- API.restart_space(repo_id=REPO_ID)
36
-
37
- try:
38
- print(EVAL_REQUESTS_PATH)
39
- snapshot_download(
40
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
41
- )
42
- except Exception:
43
- restart_space()
44
- try:
45
- print(EVAL_RESULTS_PATH)
46
- snapshot_download(
47
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
48
- )
49
- except Exception:
50
- restart_space()
51
-
52
-
53
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
54
- leaderboard_df = original_df.copy()
55
-
56
- (
57
- finished_eval_queue_df,
58
- running_eval_queue_df,
59
- pending_eval_queue_df,
60
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
61
-
62
-
63
- # Searching and filtering
64
- def update_table(
65
- hidden_df: pd.DataFrame,
66
- columns: list,
67
- type_query: list,
68
- precision_query: str,
69
- size_query: list,
70
- show_deleted: bool,
71
- query: str,
72
  ):
73
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
74
- filtered_df = filter_queries(query, filtered_df)
75
- df = select_columns(filtered_df, columns)
 
76
  return df
77
 
78
 
79
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
80
- return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
81
-
82
-
83
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
84
- always_here_cols = [
85
- AutoEvalColumn.model_type_symbol.name,
86
- AutoEvalColumn.model.name,
87
- ]
88
- # We use COLS to maintain sorting
89
- filtered_df = df[
90
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
91
- ]
92
- return filtered_df
93
 
94
 
95
- def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
96
- final_df = []
97
- if query != "":
98
- queries = [q.strip() for q in query.split(";")]
99
- for _q in queries:
100
- _q = _q.strip()
101
- if _q != "":
102
- temp_filtered_df = search_table(filtered_df, _q)
103
- if len(temp_filtered_df) > 0:
104
- final_df.append(temp_filtered_df)
105
- if len(final_df) > 0:
106
- filtered_df = pd.concat(final_df)
107
- filtered_df = filtered_df.drop_duplicates(
108
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
109
  )
110
 
111
- return filtered_df
112
-
113
-
114
- def filter_models(
115
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
116
- ) -> pd.DataFrame:
117
- # Show all models
118
- if show_deleted:
119
- filtered_df = df
120
- else: # Show only still on the hub models
121
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
122
-
123
- type_emoji = [t[0] for t in type_query]
124
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
125
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
126
-
127
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
128
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
129
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
130
- filtered_df = filtered_df.loc[mask]
131
-
132
- return filtered_df
133
-
134
-
135
- demo = gr.Blocks(css=custom_css)
136
- with demo:
137
- gr.HTML(TITLE)
138
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
139
-
140
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
141
- with gr.TabItem("🏅 TempCompass Benchmark", elem_id="llm-benchmark-tab-table", id=0):
142
- with gr.Row():
143
- with gr.Column():
144
- with gr.Row():
145
- search_bar = gr.Textbox(
146
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
147
- show_label=False,
148
- elem_id="search-bar",
149
- )
150
- with gr.Row():
151
- shown_columns = gr.CheckboxGroup(
152
- choices=[
153
- c.name
154
- for c in fields(AutoEvalColumn)
155
- if not c.hidden and not c.never_hidden
156
- ],
157
- value=[
158
- c.name
159
- for c in fields(AutoEvalColumn)
160
- if c.displayed_by_default and not c.hidden and not c.never_hidden
161
- ],
162
- label="Select columns to show",
163
- elem_id="column-select",
164
- interactive=True,
165
- )
166
- with gr.Row():
167
- deleted_models_visibility = gr.Checkbox(
168
- value=False, label="Show gated/private/deleted models", interactive=True
169
- )
170
- with gr.Column(min_width=320):
171
- #with gr.Box(elem_id="box-filter"):
172
- filter_columns_type = gr.CheckboxGroup(
173
- label="Model types",
174
- choices=[t.to_str() for t in ModelType],
175
- value=[t.to_str() for t in ModelType],
176
- interactive=True,
177
- elem_id="filter-columns-type",
178
- )
179
- filter_columns_precision = gr.CheckboxGroup(
180
- label="Precision",
181
- choices=[i.value.name for i in Precision],
182
- value=[i.value.name for i in Precision],
183
- interactive=True,
184
- elem_id="filter-columns-precision",
185
- )
186
- filter_columns_size = gr.CheckboxGroup(
187
- label="Model sizes (in billions of parameters)",
188
- choices=list(NUMERIC_INTERVALS.keys()),
189
- value=list(NUMERIC_INTERVALS.keys()),
190
- interactive=True,
191
- elem_id="filter-columns-size",
192
- )
193
 
194
- leaderboard_table = gr.components.Dataframe(
195
- value=leaderboard_df[
196
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
197
- + shown_columns.value
198
- ],
199
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
200
- datatype=TYPES,
201
- elem_id="leaderboard-table",
202
  interactive=False,
203
  visible=True,
204
- )
205
-
206
- # Dummy leaderboard for handling the case when the user uses backspace key
207
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
208
- value=original_df[COLS],
209
- headers=COLS,
210
- datatype=TYPES,
211
- visible=False,
212
- )
213
- search_bar.submit(
214
- update_table,
215
- [
216
- hidden_leaderboard_table_for_search,
217
- shown_columns,
218
- filter_columns_type,
219
- filter_columns_precision,
220
- filter_columns_size,
221
- deleted_models_visibility,
222
- search_bar,
223
- ],
224
- leaderboard_table,
225
- )
226
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
227
- selector.change(
228
- update_table,
229
- [
230
- hidden_leaderboard_table_for_search,
231
- shown_columns,
232
- filter_columns_type,
233
- filter_columns_precision,
234
- filter_columns_size,
235
- deleted_models_visibility,
236
- search_bar,
237
- ],
238
- leaderboard_table,
239
- queue=True,
240
  )
241
 
242
- # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
243
- # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
244
-
245
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
246
- with gr.Column():
247
- with gr.Row():
248
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
249
-
250
- with gr.Column():
251
- with gr.Accordion(
252
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
253
- open=False,
254
- ):
255
- with gr.Row():
256
- finished_eval_table = gr.components.Dataframe(
257
- value=finished_eval_queue_df,
258
- headers=EVAL_COLS,
259
- datatype=EVAL_TYPES,
260
- row_count=5,
261
- )
262
- with gr.Accordion(
263
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
264
- open=False,
265
- ):
266
- with gr.Row():
267
- running_eval_table = gr.components.Dataframe(
268
- value=running_eval_queue_df,
269
- headers=EVAL_COLS,
270
- datatype=EVAL_TYPES,
271
- row_count=5,
272
- )
273
 
274
- with gr.Accordion(
275
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
276
- open=False,
277
- ):
278
- with gr.Row():
279
- pending_eval_table = gr.components.Dataframe(
280
- value=pending_eval_queue_df,
281
- headers=EVAL_COLS,
282
- datatype=EVAL_TYPES,
283
- row_count=5,
284
- )
285
  with gr.Row():
286
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
287
 
288
  with gr.Row():
289
  with gr.Column():
290
- model_name_textbox = gr.Textbox(label="Model name")
291
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
292
  model_type = gr.Dropdown(
293
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
294
  label="Model type",
295
  multiselect=False,
296
  value=None,
297
  interactive=True,
298
  )
299
-
300
- with gr.Column():
301
- precision = gr.Dropdown(
302
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
303
- label="Precision",
304
- multiselect=False,
305
- value="float16",
306
- interactive=True,
307
- )
308
- weight_type = gr.Dropdown(
309
- choices=[i.value.name for i in WeightType],
310
- label="Weights type",
311
- multiselect=False,
312
- value="Original",
313
- interactive=True,
314
  )
315
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
316
 
317
- submit_button = gr.Button("Submit Eval")
318
- submission_result = gr.Markdown()
319
- submit_button.click(
320
- add_new_eval,
321
- [
322
- model_name_textbox,
323
- base_model_name_textbox,
324
- revision_name_textbox,
325
- precision,
326
- weight_type,
327
- model_type,
328
- ],
329
- submission_result,
330
- )
331
332
  with gr.Row():
333
  with gr.Accordion("📙 Citation", open=False):
334
  citation_button = gr.Textbox(
@@ -339,7 +238,6 @@ with demo:
339
  show_copy_button=True,
340
  )
341
 
342
- scheduler = BackgroundScheduler()
343
- scheduler.add_job(restart_space, "interval", seconds=1800)
344
- scheduler.start()
345
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
+
3
  import gradio as gr
4
  import pandas as pd
5
+ import re
6
+ import pdb
7
+ import tempfile
8
+
9
+ from constants import *
10
+ from src.compute import compute_scores
11
+
12
+ global data_component, filter_component
13
+
14
+
15
+ def validate_model_size(s):
16
+ pattern = r'^\d+B$|^-$'
17
+ if re.match(pattern, s):
18
+ return s
19
+ else:
20
+ return '-'
21
+
22
+ def upload_file(files):
23
+ file_paths = [file.name for file in files]
24
+ return file_paths
25
+
26
+ def add_new_eval(
27
+ input_file,
28
+ model_name_textbox: str,
29
+ revision_name_textbox: str,
30
+ model_link: str,
31
+ model_type: str,
32
+ model_size: str,
33
  ):
34
+ if input_file is None:
35
+ return "Error! Empty file!"
36
+ else:
37
+
38
+ model_size = validate_model_size(model_size)
39
+
40
+ input_file = compute_scores(input_file)
41
+ input_data = input_file[1]
42
+ input_data = [float(i) for i in input_data]
43
+
44
+ csv_data = pd.read_csv(CSV_DIR)
45
+
46
+ if revision_name_textbox == '':
47
+ col = csv_data.shape[0]
48
+ model_name = model_name_textbox
49
+ name_list = [name.split(']')[0][1:] if name.endswith(')') else name for name in csv_data['Model']]
50
+ print(name_list)
51
+ print(model_name)
52
+ assert model_name not in name_list
53
+ else:
54
+ model_name = revision_name_textbox
55
+ model_name_list = csv_data['Model']
56
+ name_list = [name.split(']')[0][1:] if name.endswith(')') else name for name in model_name_list]
57
+ if revision_name_textbox not in name_list:
58
+ col = csv_data.shape[0]
59
+ else:
60
+ col = name_list.index(revision_name_textbox)
61
+
62
+ if model_link == '':
63
+ model_name = model_name # no url
64
+ else:
65
+ model_name = '[' + model_name + '](' + model_link + ')'
66
+
67
+ # add new data
68
+ new_data = [
69
+ model_name,
70
+ model_type,
71
+ model_size,
72
+ input_data[0],
73
+ input_data[1],
74
+ input_data[2],
75
+ input_data[3],
76
+ input_data[4],
77
+ input_data[5],
78
+ input_data[6],
79
+ input_data[7],
80
+ input_data[8],
81
+ input_data[9],
82
+ input_data[10],
83
+ input_data[11],
84
+ input_data[12],
85
+ input_data[13],
86
+ input_data[14],
87
+ input_data[15],
88
+ input_data[16],
89
+ ]
90
+ csv_data.loc[col] = new_data
91
+ # with open(f'./file/{model_name}.json','w' ,encoding='utf-8') as f:
92
+ # json.dump(new_data, f)
93
+ csv_data.to_csv(CSV_DIR, index=False)
94
+ return 0
95
+
96
+ def get_baseline_df():
97
+ # pdb.set_trace()
98
+ df = pd.read_csv(CSV_DIR)
99
+ df = df.sort_values(by="Avg. All", ascending=False)
100
+ present_columns = MODEL_INFO + checkbox_group.value
101
+ df = df[present_columns]
102
  return df
103
 
104
+ def get_all_df():
105
+ df = pd.read_csv(CSV_DIR)
106
+ df = df.sort_values(by="Avg. All", ascending=False)
107
+ return df
108
 
109
+ block = gr.Blocks()
110
 
111
 
112
+ with block:
113
+ gr.Markdown(
114
+ LEADERBORAD_INTRODUCTION
115
+ )
116
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
117
+ with gr.TabItem("🏅 TempCompass Benchmark", elem_id="video-benchmark-tab-table", id=0):
118
+
119
+ gr.Markdown(
120
+ TABLE_INTRODUCTION
121
  )
122
 
123
+ # selection for column part:
124
+ checkbox_group = gr.CheckboxGroup(
125
+ choices=TASK_INFO_v2,
126
+ value=AVG_INFO,
127
+ label="Select options",
128
+ interactive=True,
129
+ )
130
 
131
+ # Create the dataframe component
132
+ data_component = gr.components.Dataframe(
133
+ value=get_baseline_df,
134
+ headers=COLUMN_NAMES,
135
+ type="pandas",
136
+ datatype=DATA_TITILE_TYPE,
137
  interactive=False,
138
  visible=True,
139
  )
140
+
141
+ def on_checkbox_group_change(selected_columns):
142
+ # pdb.set_trace()
143
+ selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
144
+ present_columns = MODEL_INFO + selected_columns
145
+ updated_data = get_all_df()[present_columns]
146
+ updated_data = updated_data.sort_values(by=present_columns[1], ascending=False)
147
+ updated_headers = present_columns
148
+ update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
149
+
150
+ filter_component = gr.components.Dataframe(
151
+ value=updated_data,
152
+ headers=updated_headers,
153
+ type="pandas",
154
+ datatype=update_datatype,
155
+ interactive=False,
156
+ visible=True,
157
+ )
158
+ # pdb.set_trace()
159
+
160
+ return filter_component.value
161
+
162
+ # Wire the checkbox group to its handler function
163
+ checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
164
+ '''
165
+ # table 2
166
+ with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
167
+ gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
168
+ '''
169
+ # table 3
170
+ with gr.TabItem("🚀 Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
171
+ gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
172
 
173
+ with gr.Row():
174
+ gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
175
 
176
  with gr.Row():
177
+ gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
178
 
179
  with gr.Row():
180
  with gr.Column():
181
+ model_name_textbox = gr.Textbox(
182
+ label="Model name", placeholder="Chat-UniVi-7B"
183
+ )
184
+ revision_name_textbox = gr.Textbox(
185
+ label="Revision Model Name", placeholder="Chat-UniVi-7B"
186
+ )
187
+ model_link = gr.Textbox(
188
+ label="Model Link", placeholder="https://github.com/PKU-YuanGroup/Chat-UniVi"
189
+ )
190
  model_type = gr.Dropdown(
191
+ choices=[
192
+ "LLM",
193
+ "ImageLLM",
194
+ "VideoLLM",
195
+ "Other",
196
+ ],
197
  label="Model type",
198
  multiselect=False,
199
  value=None,
200
  interactive=True,
201
  )
202
+ model_size = gr.Textbox(
203
+ label="Model size", placeholder="7B (format must be 'number+B' or '-'; default is '-')"
204
  )
 
205
 
206
+ with gr.Column():
207
+
208
+ input_file = gr.File(label="Click to Upload a json File", type='binary')
209
+ submit_button = gr.Button("Submit Eval")
210
+
211
+ submission_result = gr.Markdown()
212
+ submit_button.click(
213
+ add_new_eval,
214
+ inputs=[
215
+ input_file,
216
+ model_name_textbox,
217
+ revision_name_textbox,
218
+ model_link,
219
+ model_type,
220
+ model_size,
221
+ ],
222
+ # outputs = submission_result,
223
+ )
224
 
225
+ with gr.Row():
226
+ data_run = gr.Button("Refresh")
227
+ data_run.click(
228
+ get_baseline_df, outputs=data_component
229
+ )
230
+
231
  with gr.Row():
232
  with gr.Accordion("📙 Citation", open=False):
233
  citation_button = gr.Textbox(
 
238
  show_copy_button=True,
239
  )
240
 
241
+ # block.load(get_baseline_df, outputs=data_title)
242
+
243
+ block.launch()
 
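A note on the submission form above: the Model size textbox is normalized by `validate_model_size`, which keeps only values shaped like `7B` (digits followed by an uppercase `B`) or the placeholder `-`. A minimal sketch of that behavior, reusing the regex defined in app.py:

```python
import re

def validate_model_size(s):
    # Same pattern as app.py: digits followed by "B", or a bare "-"
    pattern = r'^\d+B$|^-$'
    return s if re.match(pattern, s) else '-'

print(validate_model_size("7B"))     # "7B"  (accepted)
print(validate_model_size("13B"))    # "13B" (accepted)
print(validate_model_size("7b"))     # "-"   (lowercase b is rejected)
print(validate_model_size("seven"))  # "-"   (falls back to the placeholder)
```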
constants.py ADDED
@@ -0,0 +1,94 @@
1
+ # this is .py for store constants
2
+ MODEL_INFO = ["Model"]
3
+
4
+ TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
5
+ "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
6
+ "TVQA", "MV", "NBA",
7
+ "Driving-exam", "Driving-decision-making", "SQA3D"]
8
+
9
+ AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
10
+ DATA_TITILE_TYPE = ["markdown",
11
+ "number", "number", "number", "number", "number", "number", "number",
12
+ "number", "number", "number",
13
+ "number", "number", "number",
14
+ "number", "number", "number", "number", ]
15
+ CSV_DIR = "./file/result.csv"
16
+
17
+ # COLUMN_NAMES = MODEL_INFO + TASK_INFO
18
+ COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
19
+
20
+ LEADERBORAD_INTRODUCTION = """
21
+ Welcome to the leaderboard of TempCompass! 🏆
22
+
23
+ TempCompass is a benchmark to evaluate the temporal perception ability of Video LLMs. It consists of 410 videos and 7,540 task instructions, covering 11 temporal aspects and 4 task types. Please refer to [our paper](https://arxiv.org/abs/2403.00476) for more details.
24
+ """
25
+
26
+ SUBMIT_INTRODUCTION = """
27
+ # TempCompass Leaderboard
28
+
29
+ Welcome to the leaderboard of TempCompass! 🏆
30
+
31
+ ## Submit Instruction
32
+ Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).
33
+
34
+ You will obtain the JSON file `<task_type>.json`, where `<task_type>` corresponds to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://github.com/llyx97/TempCompass/tree/main/auto_eval_results/video-llava))
35
+
36
+ For `multi-choice`, `yes_no` and `caption_matching`, the evaluation result for each question contains five keys. A specific example is as follows:
37
+ ```python
38
+ {
39
+ "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
40
+ "gt-answer": "D. fighting",
41
+ "video-llm-prediction": "D",
42
+ "match_success": true, # whether the video-llm-prediction can be assessed by rule-based matching
43
+ "rating": 1
44
+ }
45
+ ```
46
+
47
+ For `captioning`, we prompt ChatGPT to answer the multi-choice question, using the caption generated by the Video LLM as context. An example of the evaluation result is as follows:
48
+ ```python
49
+ {
50
+ "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
51
+ "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
52
+ "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
53
+ "gt-answer": "A. dunking a basketball",
54
+ "rating": 0
55
+ }
56
+ ```
57
+
58
+
59
+ ### Submit Example
60
+ For example, if you want to submit Video-LLaVA's result to the leaderboard, you need to:
61
+ 1. Fill in ‘Video-LLaVA’ in ‘Model Name’ if this is the first time you submit this model's result (you can leave ‘Revision Model Name’ blank).
62
+ 2. Fill in ‘Video-LLaVA’ in ‘Revision Model Name’ if you want to update an existing result (you can leave ‘Model Name’ blank).
63
+ 3. Select ‘ImageLLM’ in ‘Model Type’.
64
+ 4. Fill in ‘https://github.com/x/x’ in ‘Model Link’.
65
+ 5. Fill in ‘7B’ in ‘Model size’.
66
+ 6. Upload `<task_type>.json`.
67
+ 7. Click the ‘Submit Eval’ button.
68
+ 8. Click ‘Refresh’ to see the updated leaderboard.
69
+
70
+ """
71
+
72
+ TABLE_INTRODUCTION = """In the table below, we summarize the per-task performance of all models.
73
+ We use accuracy (%) as the primary evaluation metric for each task.
74
+ """
75
+
76
+ LEADERBORAD_INFO = """
77
+ Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
78
+ In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
79
+ SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
80
+ We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
81
+ Multiple-choice questions with ground-truth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
82
+ We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
83
+ By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
84
+ """
85
+
86
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
87
+ CITATION_BUTTON_TEXT = r"""
88
+ @article{liu2024tempcompass,
89
+ title = {TempCompass: Do Video LLMs Really Understand Videos?},
90
+ author = {Yuanxin Liu and Shicheng Li and Yi Liu and Yuxiang Wang and Shuhuai Ren and Lei Li and Sishuo Chen and Xu Sun and Lu Hou},
91
+ year = {2024},
92
+ journal = {arXiv preprint arXiv: 2403.00476}
93
+ }
94
+ """
file/example_eval_results/caption_matching.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/captioning.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/merged_result.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/multi-choice.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/yes_no.json ADDED
The diff for this file is too large to render. See raw diff
 
file/result.csv ADDED
@@ -0,0 +1,13 @@
1
+ Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
2
+ Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
3
+ [VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
4
+ [Video-ChatGPT-7B](https://github.com/mbzuai-oryx/Video-ChatGPT),38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
5
+ [Otter-7B](https://github.com/Luodian/Otter),37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
6
+ [PandaGPT-7B](https://github.com/yxuansu/PandaGPT),37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
7
+ [Valley-7B](https://github.com/RupertLuo/Valley),33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
8
+ [mPLUG-owl-7B](https://github.com/X-PLUG/mPLUG-Owl),33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
9
+ [Video-LLaMA-7B](https://github.com/DAMO-NLP-SG/Video-LLaMA),32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
10
+ [Chat-UniVi-7B](https://github.com/PKU-YuanGroup/Chat-UniVi),35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
11
+ sphinx-v2,45.53190476,44.22571429,41.81666667,50.55333333,0.5307,0.6845,0.5395,0.5341,0.42,0.2759,0.1111,0.3645,0.4396,0.4504,0.4722,0.5564,0.488
12
+ Gemini,49.598478632478624,50.63076923076923,47.93666666666667,50.228,0.585,0.6179,0.4742,0.5305,0.4769,0.5477,0.1176,0.4656,0.5318,0.4407,0.5285,0.4129
13
+ llava_phi_2.7,43.41644444444445,42.97,37.54333333333334,49.736,0.5785,0.608,0.514,0.4542,0.4345,0.1483,0.1111,0.392,0.4763,0.258,0.5538,0.4535
file/result.csv.bak ADDED
@@ -0,0 +1,5 @@
1
+ Model,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
2
+ Random,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
3
+ [VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
4
+ Gemini,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
5
+ llava_phi_2.7,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
merge_eval_result.py ADDED
@@ -0,0 +1,14 @@
1
+ import json, os
2
+
3
+ eval_result_path = "file/example_eval_results"
4
+ eval_result_files = [f for f in os.listdir(eval_result_path) if f.endswith('.json') and f != 'merged_result.json']  # skip the merged output itself on reruns
5
+
6
+ merged_result = {}
7
+ for fn in eval_result_files:
8
+ task_type = fn.replace('.json', '')
9
+ with open(f"{eval_result_path}/{fn}", "r") as f:
10
+ merged_result[task_type] = json.load(f)
11
+
12
+ merge_file = f"{eval_result_path}/merged_result.json"
13
+ with open(merge_file, "w") as f:
14
+ json.dump(merged_result, f, indent=4)
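Running this script over the bundled `file/example_eval_results/` folder produces a `merged_result.json` with one top-level entry per task type. A quick check of that shape (a sketch; it assumes the example files added in this commit are present and the script is run from the Space root):

```python
import json

# merge_eval_result.py writes one key per per-task result file.
with open("file/example_eval_results/merged_result.json", "r") as f:
    merged = json.load(f)

print(sorted(merged.keys()))
# expected: ['caption_matching', 'captioning', 'multi-choice', 'yes_no']
```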
src/compute.py ADDED
@@ -0,0 +1,121 @@
1
+ import json
2
+ import os
3
+ import glob
4
+ import argparse
5
+ import csv
6
+
7
+
8
+ def chatgpt_json(merge_file):
9
+ # Score the uploaded predictions against the ground-truth answers in file/ANSWER.json
10
+ merge_data = merge_file.decode("utf-8")
11
+ merge_data = json.loads(merge_data)  # the uploaded file is JSON; json.loads handles true/false/null, unlike eval
12
+ correct_answer_file = 'file/ANSWER.json'
13
+ with open(correct_answer_file, 'r', encoding='utf-8') as f:
14
+ correct_answer_data = json.load(f)
15
+
16
+ dataset_scores_dict = {}
17
+ for dataset_name, item in merge_data.items():
18
+
19
+ total_nums = len(item)
20
+ correct = 0
21
+ # assert len(item) >= len(correct_answer_data[dataset_name]), f'Video-Bench-Input.json---{dataset_name}---is incomplete!'
22
+ for id, sub_item in item.items():
23
+ if sub_item['output_chatgpt_choice'] == correct_answer_data[dataset_name][id]['answer']:
24
+ correct += 1
25
+
26
+ # dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
27
+ dataset_scores_dict[dataset_name] = round(correct / total_nums , 4)
28
+ return dataset_scores_dict
29
+
30
+
31
+ def compute_scores(merge_file):
32
+ dataset_score_dict = chatgpt_json(merge_file)
33
+ dataset_weight = {
34
+ 1:
35
+ {
36
+ "ActivityNet": 1,
37
+ "MSVD": 1,
38
+ "MSRVTT": 1,
39
+ "TGIF": 1,
40
+ "Youcook2": 1,
41
+ "Ucfcrime": 1,
42
+ "MOT": 0.5,
43
+ },
44
+
45
+ 2:
46
+ {
47
+ "TVQA": 1,
48
+ "MV": 1,
49
+ "NBA": 1,
50
+ },
51
+
52
+ 3:
53
+ {
54
+ "Driving-exam": 0.5,
55
+ "Driving-decision-making": 1,
56
+ "SQA3D": 1,
57
+ }
58
+
59
+ }
60
+
61
+ # Video-exclusive Understanding score
62
+ exclusive_understanding_weight = dataset_weight[1]
63
+ weights_sum = sum(exclusive_understanding_weight.values())
64
+ exclusive_understanding_score = 0
65
+ # import ipdb; ipdb.set_trace()
66
+ for dataset_name, weight in exclusive_understanding_weight.items():
67
+ exclusive_understanding_score += weight * dataset_score_dict[dataset_name] / weights_sum * 100
68
+
69
+ # Prior Knowledge-based Question-answer
70
+ prior_QA_weight = dataset_weight[2]
71
+ weights_sum = sum(prior_QA_weight.values())
72
+ prior_QA_score = 0
73
+ for dataset_name, weight in prior_QA_weight.items():
74
+ prior_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
75
+
76
+ # Comprehension and Decision-making
77
+ com_and_dec_QA_weight = dataset_weight[3]
78
+ weights_sum = sum(com_and_dec_QA_weight.values())
79
+ com_and_dec_QA_score = 0
80
+ for dataset_name, weight in com_and_dec_QA_weight.items():
81
+ com_and_dec_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
82
+
83
+ dataset_score_dict['Exclusive_understanding'] = exclusive_understanding_score
84
+ dataset_score_dict['Prior_Knowledge'] = prior_QA_score
85
+ dataset_score_dict['Comprehension_and_Decision-making'] = com_and_dec_QA_score
86
+
87
+ # final score
88
+ final_score = sum([exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score]) / 3
89
+ dataset_score_dict['final_score'] = final_score
90
+
91
+ # print(dataset_score_dict)
92
+ # with open(args.score_output_file, 'w', encoding='utf-8') as f:
93
+ # json.dump(dataset_score_dict, f, indent=2)
94
+ # print(f'{args.score_output_file} is saved!')
95
+ # ========================
96
+ data = [
97
+
98
+ ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
99
+ "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime",
100
+ "MOT", "TVQA", "MV", "NBA", "Driving-exam", "Driving-decision-making", "SQA3D"],
101
+
102
+ [final_score, exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score,
103
+ dataset_score_dict['ActivityNet'],
104
+ dataset_score_dict["MSVD"],
105
+ dataset_score_dict['MSRVTT'],
106
+ dataset_score_dict['TGIF'],
107
+ dataset_score_dict['Youcook2'],
108
+ dataset_score_dict['Ucfcrime'],
109
+ dataset_score_dict['MOT'],
110
+ dataset_score_dict['TVQA'],
111
+ dataset_score_dict['MV'],
112
+ dataset_score_dict['NBA'],
113
+ dataset_score_dict['Driving-exam'],
114
+ dataset_score_dict['Driving-decision-making'],
115
+ dataset_score_dict['SQA3D'],
116
+ ],
117
+ ]
118
+
119
+
120
+ return data
121
+
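The three category averages above are plain weighted means of per-dataset accuracies, scaled to percent. A standalone sketch of that formula: the weights mirror the `dataset_weight` table in `compute_scores`, and the sample accuracies are taken from the "Random" row of `file/result.csv`.

```python
def weighted_category_score(scores, weights):
    """Weighted mean of per-dataset accuracies (0-1), scaled to percent."""
    total_weight = sum(weights.values())
    return sum(weights[name] * scores[name] for name in weights) / total_weight * 100

# Video-Exclusive Understanding: seven datasets, with MOT weighted 0.5.
weights = {"ActivityNet": 1, "MSVD": 1, "MSRVTT": 1, "TGIF": 1,
           "Youcook2": 1, "Ucfcrime": 1, "MOT": 0.5}
random_scores = {"ActivityNet": 0.3458, "MSVD": 0.26224, "MSRVTT": 0.265,
                 "TGIF": 0.22377, "Youcook2": 0.25, "Ucfcrime": 0.25, "MOT": 0.1667}

print(weighted_category_score(random_scores, weights))
# ~25.8486, matching the "Random" row's "Avg. Video-Exclusive" column in file/result.csv
```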