lyx97 committed
Commit 6de388e
1 Parent(s): 5d57406
Files changed (5)
  1. app.py +14 -3
  2. constants.py +15 -11
  3. file/result.csv +6 -13
  4. file/result.csv.bak +0 -5
  5. src/compute.py +50 -88
app.py CHANGED
@@ -86,7 +86,18 @@ def add_new_eval(
         input_data[14],
         input_data[15],
         input_data[16],
+        input_data[17],
+        input_data[18],
+        input_data[19],
+        input_data[20],
+        input_data[21],
+        input_data[22],
+        input_data[23],
+        input_data[24],
     ]
+    print(len(new_data), col)
+    print(csv_data.loc[col-1])
+    print(model_name, model_type, model_size)
     csv_data.loc[col] = new_data
     # with open(f'./file/{model_name}.json','w' ,encoding='utf-8') as f:
     #     json.dump(new_data, f)
@@ -122,7 +133,7 @@ with block:

     # selection for column part:
     checkbox_group = gr.CheckboxGroup(
-        choices=TASK_INFO_v2,
+        choices=TASK_INFO,
         value=AVG_INFO,
         label="Select options",
         interactive=True,
@@ -140,7 +151,7 @@ with block:

     def on_checkbox_group_change(selected_columns):
         # pdb.set_trace()
-        selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
+        selected_columns = [item for item in TASK_INFO if item in selected_columns]
         present_columns = MODEL_INFO + selected_columns
         updated_data = get_all_df()[present_columns]
         updated_data = updated_data.sort_values(by=present_columns[1], ascending=False)
@@ -157,7 +168,7 @@ with block:
         )
         # pdb.set_trace()

-        return filter_component.value
+        return filter_component.constructor_args['value']

     # bind the checkbox group to its handler function
     checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
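
For orientation, the column-selection path touched by the last three hunks reduces to: keep the checked columns in their canonical TASK_INFO order, prepend the MODEL_INFO columns, slice the full results DataFrame, and re-sort; the real callback then feeds its result back into data_component via checkbox_group.change. A minimal self-contained sketch of that logic (toy data stands in for the app's get_all_df(); column lists are truncated here):

import pandas as pd

MODEL_INFO = ["Model", "Model Type", "Model Size"]
TASK_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No"]   # truncated for the sketch

# toy stand-in for get_all_df(), which in app.py loads file/result.csv
df = pd.DataFrame({
    "Model": ["A", "B"], "Model Type": ["Video-LLM", "Image-LLM"], "Model Size": ["7B", "-"],
    "Avg. All": [48.31, 26.47], "Avg. Multi-Choice": [66.71, 94.12], "Avg. Yes/No": [33.8, 42.23],
})

def on_checkbox_group_change(selected_columns):
    # keep the canonical TASK_INFO ordering regardless of click order
    selected_columns = [c for c in TASK_INFO if c in selected_columns]
    present_columns = MODEL_INFO + selected_columns
    updated = df[present_columns]
    # sort by the first score column (the app itself sorts by present_columns[1])
    return updated.sort_values(by=present_columns[len(MODEL_INFO)], ascending=False)

print(on_checkbox_group_change(["Avg. Yes/No", "Avg. All"]))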
constants.py CHANGED
@@ -1,21 +1,25 @@
 # this is .py for store constants
-MODEL_INFO = ["Model"]
+MODEL_INFO = ["Model", "Model Type", "Model Size"]

-TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
-                "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
-                "TVQA", "MV", "NBA",
-                "Driving-exam", "Driving-decision-making", "SQA3D"]
+TASK_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation",
+             "Action. Multi-Choice", "Action. Yes/No", "Action. Caption Matching", "Action. Caption Generation",
+             "Direction. Multi-Choice", "Direction. Yes/No", "Direction. Caption Matching", "Direction. Caption Generation",
+             "Speed. Multi-Choice", "Speed. Yes/No", "Speed. Caption Matching", "Speed. Caption Generation",
+             "Event Order. Multi-Choice", "Event Order. Yes/No", "Event Order. Caption Matching", "Event Order. Caption Generation",
+             "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation"]

-AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
+AVG_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation"]
 DATA_TITILE_TYPE = ["markdown",
-                    "number", "number", "number", "number", "number", "number", "number",
-                    "number", "number", "number",
-                    "number", "number", "number",
-                    "number", "number", "number", "number", ]
+                    "number", "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",]
 CSV_DIR = "./file/result.csv"

 # COLUMN_NAMES = MODEL_INFO + TASK_INFO
-COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
+COLUMN_NAMES = MODEL_INFO + TASK_INFO

 LEADERBORAD_INTRODUCTION = """
     Welcome to the leaderboard of TempCompass! 🏆
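
These constants define the leaderboard schema: the header row of file/result.csv below is exactly MODEL_INFO + TASK_INFO (i.e. COLUMN_NAMES). A minimal sketch of loading the file with that schema and pulling just the averaged columns, assuming pandas is available and the script is run from the repo root so constants.py is importable:

import pandas as pd

from constants import CSV_DIR, MODEL_INFO, AVG_INFO

# the CSV header matches COLUMN_NAMES, so plain column selection works
df = pd.read_csv(CSV_DIR)
summary = df[MODEL_INFO + AVG_INFO]          # model identity plus the five "Avg." columns
print(summary.sort_values(by="Avg. All", ascending=False).to_string(index=False))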
file/result.csv CHANGED
@@ -1,13 +1,6 @@
-Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
-Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
-[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
-[Video-ChatGPT-7B](https://github.com/mbzuai-oryx/Video-ChatGPT),38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
-[Otter-7B](https://github.com/Luodian/Otter),37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
-[PandaGPT-7B](https://github.com/yxuansu/PandaGPT),37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
-[Valley-7B](https://github.com/RupertLuo/Valley),33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
-[mPLUG-owl-7B](https://github.com/X-PLUG/mPLUG-Owl),33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
-[Video-LLaMA-7B](https://github.com/DAMO-NLP-SG/Video-LLaMA),32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
-[Chat-UniVi-7B](https://github.com/PKU-YuanGroup/Chat-UniVi),35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
-sphinx-v2,45.53190476,44.22571429,41.81666667,50.55333333,0.5307,0.6845,0.5395,0.5341,0.42,0.2759,0.1111,0.3645,0.4396,0.4504,0.4722,0.5564,0.488
-Gemini,49.598478632478624,50.63076923076923,47.93666666666667,50.228,0.585,0.6179,0.4742,0.5305,0.4769,0.5477,0.1176,0.4656,0.5318,0.4407,0.5285,0.4129
-llava_phi_2.7,43.41644444444445,42.97,37.54333333333334,49.736,0.5785,0.608,0.514,0.4542,0.4345,0.1483,0.1111,0.392,0.4763,0.258,0.5538,0.4535
+Model,Model Type,Model Size,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
+Random,Others,-,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
+[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),Video-LLM,7B,26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
+Gemini,Video-LLM,-,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
+llava_phi_2.7,Image-LLM,-,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
+,[],-,49.77,45.57,56.38,63.34,34.83,76.04,74.32,87.88,50.76,35.65,50.28,58.42,23.2,35.22,51.82,53.82,28.67,37.75,49.21,59.0,38.25,40.97,51.12,58.33,33.59
file/result.csv.bak DELETED
@@ -1,5 +0,0 @@
-Model,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
-Random,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
-[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
-Gemini,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
-llava_phi_2.7,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
src/compute.py CHANGED
@@ -8,85 +8,52 @@ import csv
 def chatgpt_json(merge_file):
     # chat results
     merge_data = merge_file.decode("utf-8")
+    merge_data = merge_data.replace(": true,", ": \"true\",")
+    merge_data = merge_data.replace(": false,", ": \"false\",")
     merge_data = eval(merge_data)
-    correct_answer_file = 'file/ANSWER.json'
-    with open(correct_answer_file, 'r', encoding='utf-8') as f:
-        correct_answer_data = json.load(f)

     dataset_scores_dict = {}
-    for dataset_name, item in merge_data.items():
+    for dataset_name, dataset_results in merge_data.items():

-        total_nums = len(item)
-        correct = 0
-        # assert len(item) >= len(correct_answer_data[dataset_name]), f'Video-Bench-Input.json---{dataset_name}---is incomplete!'
-        for id, sub_item in item.items():
-            if sub_item['output_chatgpt_choice'] == correct_answer_data[dataset_name][id]['answer']:
-                correct += 1
+        correct, total_nums = 0, 0
+        for id in dataset_results:
+            for dim in dataset_results[id]:
+                for result in dataset_results[id][dim]:
+                    correct += result['rating']
+                    total_nums += 1

-        # dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
-        dataset_scores_dict[dataset_name] = round(correct / total_nums , 4)
+        dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
+        # dataset_scores_dict[dataset_name] = round(correct / total_nums , 4)
     return dataset_scores_dict


 def compute_scores(merge_file):
-    dataset_score_dict = chatgpt_json(merge_file)
-    dataset_weight = {
-        1:
-        {
-            "ActivityNet": 1,
-            "MSVD": 1,
-            "MSRVTT": 1,
-            "TGIF": 1,
-            "Youcook2": 1,
-            "Ucfcrime": 1,
-            "MOT": 0.5,
-        },
-
-        2:
-        {
-            "TVQA": 1,
-            "MV": 1,
-            "NBA": 1,
-        },
-
-        3:
-        {
-            "Driving-exam": 0.5,
-            "Driving-decision-making": 1,
-            "SQA3D": 1,
-        }
-
-    }
-
-    # Video-exclusive Understanding score
-    exclusive_understanding_weight = dataset_weight[1]
-    weights_sum = sum(exclusive_understanding_weight.values())
-    exclusive_understanding_score = 0
-    # import ipdb; ipdb.set_trace()
-    for dataset_name, weight in exclusive_understanding_weight.items():
-        exclusive_understanding_score += weight * dataset_score_dict[dataset_name] / weights_sum * 100
-
-    # Prior Knowledge-based Question-answer
-    prior_QA_weight = dataset_weight[2]
-    weights_sum = sum(prior_QA_weight.values())
-    prior_QA_score = 0
-    for dataset_name, weight in prior_QA_weight.items():
-        prior_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
-
-    # Comprehension and Decision-making
-    com_and_dec_QA_weight = dataset_weight[3]
-    weights_sum = sum(com_and_dec_QA_weight.values())
-    com_and_dec_QA_score = 0
-    for dataset_name, weight in com_and_dec_QA_weight.items():
-        com_and_dec_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
-
-    dataset_score_dict['Exclusive_understanding'] = exclusive_understanding_score
-    dataset_score_dict['Prior_Knowledge'] = prior_QA_score
-    dataset_score_dict['Comprehension_and_Decision-making'] = com_and_dec_QA_score
-
-    # final score
-    final_score = sum([exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score]) / 3
-    dataset_score_dict['final_score'] = final_score
+
+    merge_data = merge_file.decode("utf-8")
+    merge_data = merge_data.replace(": true,", ": \"true\",")
+    merge_data = merge_data.replace(": false,", ": \"false\",")
+    merge_data = eval(merge_data)
+
+    dataset_scores_dict = {}
+    total_correct, total_num = 0, 0
+    eval_dims = ['action', 'speed', 'direction', 'order', 'attribute_change', 'avg']
+    for dataset_name, dataset_results in merge_data.items():
+
+        dataset_correct, dataset_num = {dim: 0 for dim in eval_dims}, {dim: 0 for dim in eval_dims}
+        for id in dataset_results:
+            for dim in dataset_results[id]:
+                for result in dataset_results[id][dim]:
+                    dataset_correct['avg'] += result['rating']
+                    dataset_correct[dim] += result['rating']
+                    dataset_num['avg'] += 1
+                    dataset_num[dim] += 1
+
+        total_correct += dataset_correct['avg']
+        total_num += dataset_num['avg']
+        for dim in eval_dims:
+            dataset_scores_dict[f"{dim}_{dataset_name}"] = round(dataset_correct[dim] / dataset_num[dim] * 100, 2)
+
+    dataset_scores_dict["avg_all"] = round(total_correct / total_num * 100, 2)

     # print(dataset_score_dict)
     # with open(args.score_output_file, 'w', encoding='utf-8') as f:
@@ -95,24 +62,19 @@ def compute_scores(merge_file):
     # ========================
     data = [

-        ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
-         "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime",
-         "MOT", "TVQA", "MV", "NBA", "Driving-exam", "Driving-decision-making", "SQA3D"],
-
-        [final_score, exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score,
-         dataset_score_dict['ActivityNet'],
-         dataset_score_dict["MSVD"],
-         dataset_score_dict['MSRVTT'],
-         dataset_score_dict['TGIF'],
-         dataset_score_dict['Youcook2'],
-         dataset_score_dict['Ucfcrime'],
-         dataset_score_dict['MOT'],
-         dataset_score_dict['TVQA'],
-         dataset_score_dict['MV'],
-         dataset_score_dict['NBA'],
-         dataset_score_dict['Driving-exam'],
-         dataset_score_dict['Driving-decision-making'],
-         dataset_score_dict['SQA3D'],
+        ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation",
+         "Action. Multi-Choice", "Action. Yes/No", "Action. Caption Matching", "Action. Caption Generation",
+         "Direction. Multi-Choice", "Direction. Yes/No", "Direction. Caption Matching", "Direction. Caption Generation",
+         "Speed. Multi-Choice", "Speed. Yes/No", "Speed. Caption Matching", "Speed. Caption Generation",
+         "Event Order. Multi-Choice", "Event Order. Yes/No", "Event Order. Caption Matching", "Event Order. Caption Generation",
+         "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation"],
+
+        [dataset_scores_dict["avg_all"], dataset_scores_dict["avg_multi-choice"], dataset_scores_dict["avg_yes_no"], dataset_scores_dict["avg_caption_matching"], dataset_scores_dict["avg_captioning"],
+         dataset_scores_dict['action_multi-choice'], dataset_scores_dict['action_yes_no'], dataset_scores_dict['action_caption_matching'], dataset_scores_dict['action_captioning'],
+         dataset_scores_dict['speed_multi-choice'], dataset_scores_dict['speed_yes_no'], dataset_scores_dict['speed_caption_matching'], dataset_scores_dict['speed_captioning'],
+         dataset_scores_dict['direction_multi-choice'], dataset_scores_dict['direction_yes_no'], dataset_scores_dict['direction_caption_matching'], dataset_scores_dict['direction_captioning'],
+         dataset_scores_dict['order_multi-choice'], dataset_scores_dict['order_yes_no'], dataset_scores_dict['order_caption_matching'], dataset_scores_dict['order_captioning'],
+         dataset_scores_dict['attribute_change_multi-choice'], dataset_scores_dict['attribute_change_yes_no'], dataset_scores_dict['attribute_change_caption_matching'], dataset_scores_dict['attribute_change_captioning'],
     ],
 ]

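
The rewritten scoring code no longer compares predictions against an answer file; it expects the uploaded result to already carry per-question ratings, keyed first by task type (the names that end up in keys such as 'action_multi-choice'), then by question id, then by temporal dimension. A small synthetic payload run through the new chatgpt_json, to illustrate the expected shape; the import path assumes the repo root is the working directory so src/compute.py is importable:

import json

from src.compute import chatgpt_json  # assumes repo root on sys.path

# minimal synthetic payload in the format the new code expects:
# {task_type: {question_id: {dimension: [{"rating": 0 or 1, ...}, ...]}}}
payload = {
    "multi-choice": {
        "q0": {"action": [{"rating": 1}], "speed": [{"rating": 0}]},
        "q1": {"direction": [{"rating": 1}]},
    },
    "yes_no": {
        "q0": {"order": [{"rating": 1}], "attribute_change": [{"rating": 1}]},
    },
}

scores = chatgpt_json(json.dumps(payload).encode("utf-8"))
print(scores)  # {'multi-choice': 66.67, 'yes_no': 100.0}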