lyx97 committed
Commit 6de388e
1 Parent(s): 5d57406
Files changed (5)
  1. app.py +14 -3
  2. constants.py +15 -11
  3. file/result.csv +6 -13
  4. file/result.csv.bak +0 -5
  5. src/compute.py +50 -88
app.py CHANGED
@@ -86,7 +86,18 @@ def add_new_eval(
         input_data[14],
         input_data[15],
         input_data[16],
+        input_data[17],
+        input_data[18],
+        input_data[19],
+        input_data[20],
+        input_data[21],
+        input_data[22],
+        input_data[23],
+        input_data[24],
     ]
+    print(len(new_data), col)
+    print(csv_data.loc[col-1])
+    print(model_name, model_type, model_size)
     csv_data.loc[col] = new_data
     # with open(f'./file/{model_name}.json','w' ,encoding='utf-8') as f:
     #     json.dump(new_data, f)
@@ -122,7 +133,7 @@ with block:

     # selection for column part:
     checkbox_group = gr.CheckboxGroup(
-        choices=TASK_INFO_v2,
+        choices=TASK_INFO,
         value=AVG_INFO,
         label="Select options",
         interactive=True,
@@ -140,7 +151,7 @@ with block:

     def on_checkbox_group_change(selected_columns):
         # pdb.set_trace()
-        selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
+        selected_columns = [item for item in TASK_INFO if item in selected_columns]
         present_columns = MODEL_INFO + selected_columns
         updated_data = get_all_df()[present_columns]
         updated_data = updated_data.sort_values(by=present_columns[1], ascending=False)
@@ -157,7 +168,7 @@ with block:
         )
         # pdb.set_trace()

-        return filter_component.value
+        return filter_component.constructor_args['value']

     # bind the checkbox group to its handler function
     checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
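
For orientation, the column-selection path touched by the last three hunks reduces to: keep the checked columns in their canonical TASK_INFO order, prepend the MODEL_INFO columns, slice the full results DataFrame, and re-sort; the real callback then feeds its result back into data_component via checkbox_group.change. A minimal self-contained sketch of that logic (toy data stands in for the app's get_all_df(); column lists are truncated here):

import pandas as pd

MODEL_INFO = ["Model", "Model Type", "Model Size"]
TASK_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No"]   # truncated for the sketch

# toy stand-in for get_all_df(), which in app.py loads file/result.csv
df = pd.DataFrame({
    "Model": ["A", "B"], "Model Type": ["Video-LLM", "Image-LLM"], "Model Size": ["7B", "-"],
    "Avg. All": [48.31, 26.47], "Avg. Multi-Choice": [66.71, 94.12], "Avg. Yes/No": [33.8, 42.23],
})

def on_checkbox_group_change(selected_columns):
    # keep the canonical TASK_INFO ordering regardless of click order
    selected_columns = [c for c in TASK_INFO if c in selected_columns]
    present_columns = MODEL_INFO + selected_columns
    updated = df[present_columns]
    # sort by the first score column (the app itself sorts by present_columns[1])
    return updated.sort_values(by=present_columns[len(MODEL_INFO)], ascending=False)

print(on_checkbox_group_change(["Avg. Yes/No", "Avg. All"]))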
constants.py CHANGED
@@ -1,21 +1,25 @@
 # this is .py for store constants
-MODEL_INFO = ["Model"]
+MODEL_INFO = ["Model", "Model Type", "Model Size"]

-TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
-                "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
-                "TVQA", "MV", "NBA",
-                "Driving-exam", "Driving-decision-making", "SQA3D"]
+TASK_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation",
+             "Action. Multi-Choice", "Action. Yes/No", "Action. Caption Matching", "Action. Caption Generation",
+             "Direction. Multi-Choice", "Direction. Yes/No", "Direction. Caption Matching", "Direction. Caption Generation",
+             "Speed. Multi-Choice", "Speed. Yes/No", "Speed. Caption Matching", "Speed. Caption Generation",
+             "Event Order. Multi-Choice", "Event Order. Yes/No", "Event Order. Caption Matching", "Event Order. Caption Generation",
+             "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation"]

-AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
+AVG_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation"]
 DATA_TITILE_TYPE = ["markdown",
-                    "number", "number", "number", "number", "number", "number", "number",
-                    "number", "number", "number",
-                    "number", "number", "number",
-                    "number", "number", "number", "number", ]
+                    "number", "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",]
 CSV_DIR = "./file/result.csv"

 # COLUMN_NAMES = MODEL_INFO + TASK_INFO
-COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
+COLUMN_NAMES = MODEL_INFO + TASK_INFO

 LEADERBORAD_INTRODUCTION = """
     Welcome to the leaderboard of TempCompass! 🏆
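
These constants define the leaderboard schema: the header row of file/result.csv below is exactly MODEL_INFO + TASK_INFO (i.e. COLUMN_NAMES). A minimal sketch of loading the file with that schema and pulling just the averaged columns, assuming pandas is available and the script is run from the repo root so constants.py is importable:

import pandas as pd

from constants import CSV_DIR, MODEL_INFO, AVG_INFO

# the CSV header matches COLUMN_NAMES, so plain column selection works
df = pd.read_csv(CSV_DIR)
summary = df[MODEL_INFO + AVG_INFO]          # model identity plus the five "Avg." columns
print(summary.sort_values(by="Avg. All", ascending=False).to_string(index=False))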
file/result.csv CHANGED
@@ -1,13 +1,6 @@
-Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
-Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
-[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
-[Video-ChatGPT-7B](https://github.com/mbzuai-oryx/Video-ChatGPT),38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
-[Otter-7B](https://github.com/Luodian/Otter),37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
-[PandaGPT-7B](https://github.com/yxuansu/PandaGPT),37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
-[Valley-7B](https://github.com/RupertLuo/Valley),33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
-[mPLUG-owl-7B](https://github.com/X-PLUG/mPLUG-Owl),33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
-[Video-LLaMA-7B](https://github.com/DAMO-NLP-SG/Video-LLaMA),32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
-[Chat-UniVi-7B](https://github.com/PKU-YuanGroup/Chat-UniVi),35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
-sphinx-v2,45.53190476,44.22571429,41.81666667,50.55333333,0.5307,0.6845,0.5395,0.5341,0.42,0.2759,0.1111,0.3645,0.4396,0.4504,0.4722,0.5564,0.488
-Gemini,49.598478632478624,50.63076923076923,47.93666666666667,50.228,0.585,0.6179,0.4742,0.5305,0.4769,0.5477,0.1176,0.4656,0.5318,0.4407,0.5285,0.4129
-llava_phi_2.7,43.41644444444445,42.97,37.54333333333334,49.736,0.5785,0.608,0.514,0.4542,0.4345,0.1483,0.1111,0.392,0.4763,0.258,0.5538,0.4535
+Model,Model Type,Model Size,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
+Random,Others,-,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
+[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),Video-LLM,7B,26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
+Gemini,Video-LLM,-,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
+llava_phi_2.7,Image-LLM,-,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
+,[],-,49.77,45.57,56.38,63.34,34.83,76.04,74.32,87.88,50.76,35.65,50.28,58.42,23.2,35.22,51.82,53.82,28.67,37.75,49.21,59.0,38.25,40.97,51.12,58.33,33.59
file/result.csv.bak DELETED
@@ -1,5 +0,0 @@
-Model,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
-Random,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
-[VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
-Gemini,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
-llava_phi_2.7,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
src/compute.py CHANGED
@@ -8,85 +8,52 @@ import csv
 def chatgpt_json(merge_file):
     # chat results
     merge_data = merge_file.decode("utf-8")
+    merge_data = merge_data.replace(": true,", ": \"true\",")
+    merge_data = merge_data.replace(": false,", ": \"false\",")
     merge_data = eval(merge_data)
-    correct_answer_file = 'file/ANSWER.json'
-    with open(correct_answer_file, 'r', encoding='utf-8') as f:
-        correct_answer_data = json.load(f)

     dataset_scores_dict = {}
-    for dataset_name, item in merge_data.items():
+    for dataset_name, dataset_results in merge_data.items():

-        total_nums = len(item)
-        correct = 0
-        # assert len(item) >= len(correct_answer_data[dataset_name]), f'Video-Bench-Input.json---{dataset_name}---is incomplete!'
-        for id, sub_item in item.items():
-            if sub_item['output_chatgpt_choice'] == correct_answer_data[dataset_name][id]['answer']:
-                correct += 1
+        correct, total_nums = 0, 0
+        for id in dataset_results:
+            for dim in dataset_results[id]:
+                for result in dataset_results[id][dim]:
+                    correct += result['rating']
+                    total_nums += 1

-        # dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
-        dataset_scores_dict[dataset_name] = round(correct / total_nums , 4)
+        dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
+        # dataset_scores_dict[dataset_name] = round(correct / total_nums , 4)
     return dataset_scores_dict


 def compute_scores(merge_file):
-    dataset_score_dict = chatgpt_json(merge_file)
-    dataset_weight = {
-        1:
-        {
-            "ActivityNet": 1,
-            "MSVD": 1,
-            "MSRVTT": 1,
-            "TGIF": 1,
-            "Youcook2": 1,
-            "Ucfcrime": 1,
-            "MOT": 0.5,
-        },
-
-        2:
-        {
-            "TVQA": 1,
-            "MV": 1,
-            "NBA": 1,
-        },
-
-        3:
-        {
-            "Driving-exam": 0.5,
-            "Driving-decision-making": 1,
-            "SQA3D": 1,
-        }
-
-    }
-
-    # Video-exclusive Understanding score
-    exclusive_understanding_weight = dataset_weight[1]
-    weights_sum = sum(exclusive_understanding_weight.values())
-    exclusive_understanding_score = 0
-    # import ipdb; ipdb.set_trace()
-    for dataset_name, weight in exclusive_understanding_weight.items():
-        exclusive_understanding_score += weight * dataset_score_dict[dataset_name] / weights_sum * 100
-
-    # Prior Knowledge-based Question-answer
-    prior_QA_weight = dataset_weight[2]
-    weights_sum = sum(prior_QA_weight.values())
-    prior_QA_score = 0
-    for dataset_name, weight in prior_QA_weight.items():
-        prior_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
-
-    # Comprehension and Decision-making
-    com_and_dec_QA_weight = dataset_weight[3]
-    weights_sum = sum(com_and_dec_QA_weight.values())
-    com_and_dec_QA_score = 0
-    for dataset_name, weight in com_and_dec_QA_weight.items():
-        com_and_dec_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
-
-    dataset_score_dict['Exclusive_understanding'] = exclusive_understanding_score
-    dataset_score_dict['Prior_Knowledge'] = prior_QA_score
-    dataset_score_dict['Comprehension_and_Decision-making'] = com_and_dec_QA_score
-
-    # final score
-    final_score = sum([exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score]) / 3
-    dataset_score_dict['final_score'] = final_score
+
+    merge_data = merge_file.decode("utf-8")
+    merge_data = merge_data.replace(": true,", ": \"true\",")
+    merge_data = merge_data.replace(": false,", ": \"false\",")
+    merge_data = eval(merge_data)
+
+    dataset_scores_dict = {}
+    total_correct, total_num = 0, 0
+    eval_dims = ['action', 'speed', 'direction', 'order', 'attribute_change', 'avg']
+    for dataset_name, dataset_results in merge_data.items():
+
+        dataset_correct, dataset_num = {dim: 0 for dim in eval_dims}, {dim: 0 for dim in eval_dims}
+        for id in dataset_results:
+            for dim in dataset_results[id]:
+                for result in dataset_results[id][dim]:
+                    dataset_correct['avg'] += result['rating']
+                    dataset_correct[dim] += result['rating']
+                    dataset_num['avg'] += 1
+                    dataset_num[dim] += 1
+
+        total_correct += dataset_correct['avg']
+        total_num += dataset_num['avg']
+        for dim in eval_dims:
+            dataset_scores_dict[f"{dim}_{dataset_name}"] = round(dataset_correct[dim] / dataset_num[dim] * 100, 2)
+
+    dataset_scores_dict["avg_all"] = round(total_correct / total_num * 100, 2)

     # print(dataset_score_dict)
     # with open(args.score_output_file, 'w', encoding='utf-8') as f:
@@ -95,24 +62,19 @@ def compute_scores(merge_file):
     # ========================
     data = [

-        ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
-         "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime",
-         "MOT", "TVQA", "MV", "NBA", "Driving-exam", "Driving-decision-making", "SQA3D"],
-
-        [final_score, exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score,
-         dataset_score_dict['ActivityNet'],
-         dataset_score_dict["MSVD"],
-         dataset_score_dict['MSRVTT'],
-         dataset_score_dict['TGIF'],
-         dataset_score_dict['Youcook2'],
-         dataset_score_dict['Ucfcrime'],
-         dataset_score_dict['MOT'],
-         dataset_score_dict['TVQA'],
-         dataset_score_dict['MV'],
-         dataset_score_dict['NBA'],
-         dataset_score_dict['Driving-exam'],
-         dataset_score_dict['Driving-decision-making'],
-         dataset_score_dict['SQA3D'],
+        ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation",
+         "Action. Multi-Choice", "Action. Yes/No", "Action. Caption Matching", "Action. Caption Generation",
+         "Direction. Multi-Choice", "Direction. Yes/No", "Direction. Caption Matching", "Direction. Caption Generation",
+         "Speed. Multi-Choice", "Speed. Yes/No", "Speed. Caption Matching", "Speed. Caption Generation",
+         "Event Order. Multi-Choice", "Event Order. Yes/No", "Event Order. Caption Matching", "Event Order. Caption Generation",
+         "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation"],
+
+        [dataset_scores_dict["avg_all"], dataset_scores_dict["avg_multi-choice"], dataset_scores_dict["avg_yes_no"], dataset_scores_dict["avg_caption_matching"], dataset_scores_dict["avg_captioning"],
+         dataset_scores_dict['action_multi-choice'], dataset_scores_dict['action_yes_no'], dataset_scores_dict['action_caption_matching'], dataset_scores_dict['action_captioning'],
+         dataset_scores_dict['speed_multi-choice'], dataset_scores_dict['speed_yes_no'], dataset_scores_dict['speed_caption_matching'], dataset_scores_dict['speed_captioning'],
+         dataset_scores_dict['direction_multi-choice'], dataset_scores_dict['direction_yes_no'], dataset_scores_dict['direction_caption_matching'], dataset_scores_dict['direction_captioning'],
+         dataset_scores_dict['order_multi-choice'], dataset_scores_dict['order_yes_no'], dataset_scores_dict['order_caption_matching'], dataset_scores_dict['order_captioning'],
+         dataset_scores_dict['attribute_change_multi-choice'], dataset_scores_dict['attribute_change_yes_no'], dataset_scores_dict['attribute_change_caption_matching'], dataset_scores_dict['attribute_change_captioning'],
     ],
 ]

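
The rewritten scoring code no longer compares predictions against an answer file; it expects the uploaded result to already carry per-question ratings, keyed first by task type (the names that end up in keys such as 'action_multi-choice'), then by question id, then by temporal dimension. A small synthetic payload run through the new chatgpt_json, to illustrate the expected shape; the import path assumes the repo root is the working directory so src/compute.py is importable:

import json

from src.compute import chatgpt_json  # assumes repo root on sys.path

# minimal synthetic payload in the format the new code expects:
# {task_type: {question_id: {dimension: [{"rating": 0 or 1, ...}, ...]}}}
payload = {
    "multi-choice": {
        "q0": {"action": [{"rating": 1}], "speed": [{"rating": 0}]},
        "q1": {"direction": [{"rating": 1}]},
    },
    "yes_no": {
        "q0": {"order": [{"rating": 1}], "attribute_change": [{"rating": 1}]},
    },
}

scores = chatgpt_json(json.dumps(payload).encode("utf-8"))
print(scores)  # {'multi-choice': 66.67, 'yes_no': 100.0}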