update
Files changed:
- app.py  +204 -306
- constants.py  +94 -0
- file/example_eval_results/caption_matching.json  +0 -0
- file/example_eval_results/captioning.json  +0 -0
- file/example_eval_results/merged_result.json  +0 -0
- file/example_eval_results/multi-choice.json  +0 -0
- file/example_eval_results/yes_no.json  +0 -0
- file/result.csv  +13 -0
- file/result.csv.bak  +5 -0
- merge_eval_result.py  +14 -0
- src/compute.py  +121 -0
app.py
CHANGED
@@ -1,334 +1,233 @@
Old version (left column of the side-by-side diff; removed lines are prefixed with "-", unchanged context lines with a space; spans the diff viewer did not render are marked "…"):

-… (removed line not rendered)
 import gradio as gr
 import pandas as pd
-… (removed import lines not rendered)
-)
-… (removed lines not rendered)
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-leaderboard_df = original_df.copy()
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-
-# Searching and filtering
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
 ):
-… (removed lines not rendered)
     return df


-
-    return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
-
-
-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        AutoEvalColumn.model_type_symbol.name,
-        AutoEvalColumn.model.name,
-    ]
-    # We use COLS to maintain sorting
-    filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
-    ]
-    return filtered_df


-… (removed lines not rendered)
-                    final_df.append(temp_filtered_df)
-        if len(final_df) > 0:
-            filtered_df = pd.concat(final_df)
-            filtered_df = filtered_df.drop_duplicates(
-                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
             )

-
-… (removed lines not rendered)
-    if show_deleted:
-        filtered_df = df
-    else: # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
-
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 TempCompass Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden
-                            ],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
-                        )
-                with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )

-… (removed lines not rendered)
-                datatype=TYPES,
-                elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
             )

-
-
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )

-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

             with gr.Row():
                 with gr.Column():
-                    model_name_textbox = gr.Textbox(
-… (removed line not rendered)
                     model_type = gr.Dropdown(
-                        choices=[
                         label="Model type",
                         multiselect=False,
                         value=None,
                         interactive=True,
                     )
-
-
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-… (removed lines not rendered)

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(

@@ -339,7 +238,6 @@ with demo:
                 show_copy_button=True,
             )

-
-
-
-demo.queue(default_concurrency_limit=40).launch()
New version (right column of the side-by-side diff; added lines are prefixed with "+", unchanged context lines with a space):

+__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
+
 import gradio as gr
 import pandas as pd
+import re
+import pdb
+import tempfile
+
+from constants import *
+from src.compute import compute_scores
+
+global data_component, filter_component
+
+
+def validate_model_size(s):
+    pattern = r'^\d+B$|^-$'
+    if re.match(pattern, s):
+        return s
+    else:
+        return '-'
+
+def upload_file(files):
+    file_paths = [file.name for file in files]
+    return file_paths
+
+def add_new_eval(
+    input_file,
+    model_name_textbox: str,
+    revision_name_textbox: str,
+    model_link: str,
+    model_type: str,
+    model_size: str,
 ):
+    if input_file is None:
+        return "Error! Empty file!"
+    else:
+
+        model_size = validate_model_size(model_size)
+
+        input_file = compute_scores(input_file)
+        input_data = input_file[1]
+        input_data = [float(i) for i in input_data]
+
+        csv_data = pd.read_csv(CSV_DIR)
+
+        if revision_name_textbox == '':
+            col = csv_data.shape[0]
+            model_name = model_name_textbox
+            name_list = [name.split(']')[0][1:] if name.endswith(')') else name for name in csv_data['Model']]
+            print(name_list)
+            print(model_name)
+            assert model_name not in name_list
+        else:
+            model_name = revision_name_textbox
+            model_name_list = csv_data['Model']
+            name_list = [name.split(']')[0][1:] if name.endswith(')') else name for name in model_name_list]
+            if revision_name_textbox not in name_list:
+                col = csv_data.shape[0]
+            else:
+                col = name_list.index(revision_name_textbox)
+
+        if model_link == '':
+            model_name = model_name  # no url
+        else:
+            model_name = '[' + model_name + '](' + model_link + ')'
+
+        # add new data
+        new_data = [
+            model_name,
+            model_type,
+            model_size,
+            input_data[0],
+            input_data[1],
+            input_data[2],
+            input_data[3],
+            input_data[4],
+            input_data[5],
+            input_data[6],
+            input_data[7],
+            input_data[8],
+            input_data[9],
+            input_data[10],
+            input_data[11],
+            input_data[12],
+            input_data[13],
+            input_data[14],
+            input_data[15],
+            input_data[16],
+        ]
+        csv_data.loc[col] = new_data
+        # with open(f'./file/{model_name}.json','w' ,encoding='utf-8') as f:
+        #     json.dump(new_data, f)
+        csv_data.to_csv(CSV_DIR, index=False)
+    return 0
+
+def get_baseline_df():
+    # pdb.set_trace()
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Avg. All", ascending=False)
+    present_columns = MODEL_INFO + checkbox_group.value
+    df = df[present_columns]
     return df

+def get_all_df():
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Avg. All", ascending=False)
+    return df

+block = gr.Blocks()


+with block:
+    gr.Markdown(
+        LEADERBORAD_INTRODUCTION
+    )
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 TempCompass Benchmark", elem_id="video-benchmark-tab-table", id=0):
+
+            gr.Markdown(
+                TABLE_INTRODUCTION
             )

+            # selection for column part:
+            checkbox_group = gr.CheckboxGroup(
+                choices=TASK_INFO_v2,
+                value=AVG_INFO,
+                label="Select options",
+                interactive=True,
+            )

+            # create the dataframe component
+            data_component = gr.components.Dataframe(
+                value=get_baseline_df,
+                headers=COLUMN_NAMES,
+                type="pandas",
+                datatype=DATA_TITILE_TYPE,
                 interactive=False,
                 visible=True,
             )
+
+            def on_checkbox_group_change(selected_columns):
+                # pdb.set_trace()
+                selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
+                present_columns = MODEL_INFO + selected_columns
+                updated_data = get_all_df()[present_columns]
+                updated_data = updated_data.sort_values(by=present_columns[1], ascending=False)
+                updated_headers = present_columns
+                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                filter_component = gr.components.Dataframe(
+                    value=updated_data,
+                    headers=updated_headers,
+                    type="pandas",
+                    datatype=update_datatype,
+                    interactive=False,
+                    visible=True,
+                )
+                # pdb.set_trace()
+
+                return filter_component.value
+
+            # hook the checkbox group up to its change handler
+            checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
+        '''
+        # table 2
+        with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
+            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
+        '''
+        # table 3
+        with gr.TabItem("🚀 Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
+            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")

+            with gr.Row():
+                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

             with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")

             with gr.Row():
                 with gr.Column():
+                    model_name_textbox = gr.Textbox(
+                        label="Model name", placeholder="Chat-UniVi-7B"
+                    )
+                    revision_name_textbox = gr.Textbox(
+                        label="Revision Model Name", placeholder="Chat-UniVi-7B"
+                    )
+                    model_link = gr.Textbox(
+                        label="Model Link", placeholder="https://github.com/PKU-YuanGroup/Chat-UniVi"
+                    )
                     model_type = gr.Dropdown(
+                        choices=[
+                            "LLM",
+                            "ImageLLM",
+                            "VideoLLM",
+                            "Other",
+                        ],
                         label="Model type",
                         multiselect=False,
                         value=None,
                         interactive=True,
                     )
+                    model_size = gr.Textbox(
+                        label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
                     )

+                with gr.Column():
+
+                    input_file = gr.File(label="Click to Upload a json File", type='binary')
+                    submit_button = gr.Button("Submit Eval")
+
+                    submission_result = gr.Markdown()
+                    submit_button.click(
+                        add_new_eval,
+                        inputs=[
+                            input_file,
+                            model_name_textbox,
+                            revision_name_textbox,
+                            model_link,
+                            model_type,
+                            model_size,
+                        ],
+                        # outputs = submission_result,
+                    )

+            with gr.Row():
+                data_run = gr.Button("Refresh")
+                data_run.click(
+                    get_baseline_df, outputs=data_component
+                )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(

@@ -339,7 +238,6 @@ with demo:
                 show_copy_button=True,
             )

+    # block.load(get_baseline_df, outputs=data_title)
+
+block.launch()
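A quick, illustrative check of the size-format rule enforced by `validate_model_size` above (this snippet is not part of the commit; it only demonstrates what the regex accepts):

```python
import re

pattern = r'^\d+B$|^-$'                # same pattern as in validate_model_size
assert re.match(pattern, "7B")         # accepted: digits followed by an uppercase "B"
assert re.match(pattern, "-")          # accepted: the "unknown size" placeholder
assert not re.match(pattern, "7b")     # lowercase "b" is rejected, so the app falls back to '-'
assert not re.match(pattern, "1.3B")   # fractional sizes are rejected as well
```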
constants.py
ADDED
@@ -0,0 +1,94 @@
# this .py file stores constants
MODEL_INFO = ["Model"]

TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
                "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
                "TVQA", "MV", "NBA",
                "Driving-exam", "Driving-decision-making", "SQA3D"]

AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
DATA_TITILE_TYPE = ["markdown",
                    "number", "number", "number", "number", "number", "number", "number",
                    "number", "number", "number",
                    "number", "number", "number",
                    "number", "number", "number", "number", ]
CSV_DIR = "./file/result.csv"

# COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2

LEADERBORAD_INTRODUCTION = """
Welcome to the leaderboard of TempCompass! 🏆

TempCompass is a benchmark to evaluate the temporal perception ability of Video LLMs. It consists of 410 videos and 7,540 task instructions, covering 11 temporal aspects and 4 task types. Please refer to [our paper](https://arxiv.org/abs/2403.00476) for more details.
"""

SUBMIT_INTRODUCTION = """
# TempCompass Leaderboard

Welcome to the leaderboard of TempCompass! 🏆

## Submit Instruction
Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).

You will obtain the JSON file `<task_type>.json`, where `<task_type>` corresponds to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://github.com/llyx97/TempCompass/tree/main/auto_eval_results/video-llava))

For `multi-choice`, `yes_no` and `caption_matching`, the evaluation result for each question contains five keys. A specific example is as follows:
```python
{
    "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
    "gt-answer": "D. fighting",
    "video-llm-prediction": "D",
    "match_success": true,  # whether the video-llm-prediction can be assessed by rule-based matching
    "rating": 1
}
```

For `captioning`, we prompt ChatGPT to answer the multi-choice question, using the Video LLM generated caption as context. An example of the evaluation result is as follows:
```python
{
    "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
    "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
    "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
    "gt-answer": "A. dunking a basketball",
    "rating": 0
}
```


### Submit Example
For example, if you want to submit Video-LLaVA's result to the leaderboard, you need to:
1. Fill in 'Video-LLaVA' in 'Model Name' if it is your first time submitting a result (you can leave 'Revision Model Name' blank).
2. Fill in 'Video-LLaVA' in 'Revision Model Name' if you want to update an existing result (you can leave 'Model Name' blank).
3. Select 'ImageLLM' in 'Model Type'.
4. Fill in 'https://github.com/x/x' in 'Model Link'.
5. Fill in '7B' in 'Model size'.
6. Upload `<task_type>.json`.
7. Click the 'Submit Eval' button.
8. Click 'Refresh' to obtain the updated leaderboard.
"""

TABLE_INTRODUCTION = """In the table below, we summarize the per-task performance of all the models.
We use accuracy (%) as the primary evaluation metric for each task.
"""

LEADERBORAD_INFO = """
Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
Multiple-choice questions with groundtruth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{liu2024tempcompass,
    title = {TempCompass: Do Video LLMs Really Understand Videos?},
    author = {Yuanxin Liu and Shicheng Li and Yi Liu and Yuxiang Wang and Shuhuai Ren and Lei Li and Sishuo Chen and Xu Sun and Lu Hou},
    year = {2024},
    journal = {arXiv preprint arXiv: 2403.00476}
}
"""
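As a small illustrative cross-check of the column definitions above (not part of the commit), the leaderboard header list and the per-column display types are expected to stay in lockstep:

```python
# Illustrative sanity check: COLUMN_NAMES is MODEL_INFO + TASK_INFO_v2, and every
# column needs a matching entry in DATA_TITILE_TYPE (1 "markdown" + 17 "number").
from constants import AVG_INFO, COLUMN_NAMES, DATA_TITILE_TYPE, MODEL_INFO, TASK_INFO_v2

assert COLUMN_NAMES == MODEL_INFO + TASK_INFO_v2
assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE) == 18
assert set(AVG_INFO) <= set(TASK_INFO_v2)  # the default-selected averages are real columns
```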
file/example_eval_results/caption_matching.json
ADDED
The diff for this file is too large to render. See raw diff.

file/example_eval_results/captioning.json
ADDED
The diff for this file is too large to render. See raw diff.

file/example_eval_results/merged_result.json
ADDED
The diff for this file is too large to render. See raw diff.

file/example_eval_results/multi-choice.json
ADDED
The diff for this file is too large to render. See raw diff.

file/example_eval_results/yes_no.json
ADDED
The diff for this file is too large to render. See raw diff.
file/result.csv
ADDED
@@ -0,0 +1,13 @@
Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
[Video-ChatGPT-7B](https://github.com/mbzuai-oryx/Video-ChatGPT),38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
[Otter-7B](https://github.com/Luodian/Otter),37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
[PandaGPT-7B](https://github.com/yxuansu/PandaGPT),37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
[Valley-7B](https://github.com/RupertLuo/Valley),33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
[mPLUG-owl-7B](https://github.com/X-PLUG/mPLUG-Owl),33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
[Video-LLaMA-7B](https://github.com/DAMO-NLP-SG/Video-LLaMA),32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
[Chat-UniVi-7B](https://github.com/PKU-YuanGroup/Chat-UniVi),35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
sphinx-v2,45.53190476,44.22571429,41.81666667,50.55333333,0.5307,0.6845,0.5395,0.5341,0.42,0.2759,0.1111,0.3645,0.4396,0.4504,0.4722,0.5564,0.488
Gemini,49.598478632478624,50.63076923076923,47.93666666666667,50.228,0.585,0.6179,0.4742,0.5305,0.4769,0.5477,0.1176,0.4656,0.5318,0.4407,0.5285,0.4129
llava_phi_2.7,43.41644444444445,42.97,37.54333333333334,49.736,0.5785,0.608,0.514,0.4542,0.4345,0.1483,0.1111,0.392,0.4763,0.258,0.5538,0.4535
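Most rows in this CSV store the model as a markdown link; for reference, this is how the plain model name is recovered from such a cell in `add_new_eval` (illustrative snippet, not part of the commit):

```python
name = "[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything)"
plain = name.split(']')[0][1:] if name.endswith(')') else name  # same expression as in add_new_eval
print(plain)  # -> VideoChat-7B
```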
file/result.csv.bak
ADDED
@@ -0,0 +1,5 @@
Model,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
Random,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
Gemini,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
llava_phi_2.7,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
merge_eval_result.py
ADDED
@@ -0,0 +1,14 @@
import json, os

eval_result_path = "file/example_eval_results"
eval_result_files = [f for f in os.listdir(eval_result_path) if f.endswith('.json')]

merged_result = {}
for fn in eval_result_files:
    task_type = fn.replace('.json', '')
    with open(f"{eval_result_path}/{fn}", "r") as f:
        merged_result[task_type] = json.load(f)

merge_file = f"{eval_result_path}/merged_result.json"
with open(merge_file, "w") as f:
    json.dump(merged_result, f, indent=4)
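Run as `python merge_eval_result.py`, the script collects every `.json` file in `file/example_eval_results/` under its task-type key. A minimal illustration of the resulting structure, assuming the four example result files shipped in this commit:

```python
import json

# Illustrative only: the merged file is keyed by task type, each value holding
# that task's original per-question results.
with open("file/example_eval_results/merged_result.json") as f:
    merged = json.load(f)
print(sorted(merged))  # expected: ['caption_matching', 'captioning', 'multi-choice', 'yes_no']
```

Note that because the script sweeps every `.json` in the folder, rerunning it after `merged_result.json` already exists would fold the previous merged file into the new one as an extra key.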
src/compute.py
ADDED
@@ -0,0 +1,121 @@
import json
import os
import glob
import argparse
import csv


def chatgpt_json(merge_file):
    # chat results
    merge_data = merge_file.decode("utf-8")
    merge_data = eval(merge_data)
    correct_answer_file = 'file/ANSWER.json'
    with open(correct_answer_file, 'r', encoding='utf-8') as f:
        correct_answer_data = json.load(f)

    dataset_scores_dict = {}
    for dataset_name, item in merge_data.items():

        total_nums = len(item)
        correct = 0
        # assert len(item) >= len(correct_answer_data[dataset_name]), f'Video-Bench-Input.json---{dataset_name}---is incomplete!'
        for id, sub_item in item.items():
            if sub_item['output_chatgpt_choice'] == correct_answer_data[dataset_name][id]['answer']:
                correct += 1

        # dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
        dataset_scores_dict[dataset_name] = round(correct / total_nums, 4)
    return dataset_scores_dict


def compute_scores(merge_file):
    dataset_score_dict = chatgpt_json(merge_file)
    dataset_weight = {
        1:
            {
                "ActivityNet": 1,
                "MSVD": 1,
                "MSRVTT": 1,
                "TGIF": 1,
                "Youcook2": 1,
                "Ucfcrime": 1,
                "MOT": 0.5,
            },

        2:
            {
                "TVQA": 1,
                "MV": 1,
                "NBA": 1,
            },

        3:
            {
                "Driving-exam": 0.5,
                "Driving-decision-making": 1,
                "SQA3D": 1,
            }

    }

    # Video-exclusive Understanding score
    exclusive_understanding_weight = dataset_weight[1]
    weights_sum = sum(exclusive_understanding_weight.values())
    exclusive_understanding_score = 0
    # import ipdb; ipdb.set_trace()
    for dataset_name, weight in exclusive_understanding_weight.items():
        exclusive_understanding_score += weight * dataset_score_dict[dataset_name] / weights_sum * 100

    # Prior Knowledge-based Question-answer
    prior_QA_weight = dataset_weight[2]
    weights_sum = sum(prior_QA_weight.values())
    prior_QA_score = 0
    for dataset_name, weight in prior_QA_weight.items():
        prior_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum * 100

    # Comprehension and Decision-making
    com_and_dec_QA_weight = dataset_weight[3]
    weights_sum = sum(com_and_dec_QA_weight.values())
    com_and_dec_QA_score = 0
    for dataset_name, weight in com_and_dec_QA_weight.items():
        com_and_dec_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum * 100

    dataset_score_dict['Exclusive_understanding'] = exclusive_understanding_score
    dataset_score_dict['Prior_Knowledge'] = prior_QA_score
    dataset_score_dict['Comprehension_and_Decision-making'] = com_and_dec_QA_score

    # final score
    final_score = sum([exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score]) / 3
    dataset_score_dict['final_score'] = final_score

    # print(dataset_score_dict)
    # with open(args.score_output_file, 'w', encoding='utf-8') as f:
    #     json.dump(dataset_score_dict, f, indent=2)
    #     print(f'{args.score_output_file} is saved!')
    # ========================
    data = [

        ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
         "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime",
         "MOT", "TVQA", "MV", "NBA", "Driving-exam", "Driving-decision-making", "SQA3D"],

        [final_score, exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score,
         dataset_score_dict['ActivityNet'],
         dataset_score_dict["MSVD"],
         dataset_score_dict['MSRVTT'],
         dataset_score_dict['TGIF'],
         dataset_score_dict['Youcook2'],
         dataset_score_dict['Ucfcrime'],
         dataset_score_dict['MOT'],
         dataset_score_dict['TVQA'],
         dataset_score_dict['MV'],
         dataset_score_dict['NBA'],
         dataset_score_dict['Driving-exam'],
         dataset_score_dict['Driving-decision-making'],
         dataset_score_dict['SQA3D'],
         ],
    ]


    return data
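To make the weighting in `compute_scores` concrete, here is a small worked example with made-up per-dataset accuracies (illustrative only, not part of the commit): each category score is a weighted mean scaled to 0-100, and the final leaderboard value is the plain mean of the three category scores.

```python
# Category 1 ("Video-exclusive") weights, as defined in compute_scores above.
weights = {"ActivityNet": 1, "MSVD": 1, "MSRVTT": 1, "TGIF": 1,
           "Youcook2": 1, "Ucfcrime": 1, "MOT": 0.5}
scores = {name: 0.40 for name in weights}      # pretend every dataset scored 40% accuracy

weights_sum = sum(weights.values())            # 6.5, since MOT only counts half
category = sum(w * scores[name] / weights_sum * 100 for name, w in weights.items())
print(round(category, 2))                      # 40.0 -- uniform inputs are unchanged by the weighting

# The overall "Avg. All" value is then the unweighted mean of the three category scores:
final_score = sum([40.0, 50.0, 60.0]) / 3      # e.g. 50.0 for these made-up categories
```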