lyx97 committed on
Commit
5d57406
1 Parent(s): b23950f
app.py CHANGED
@@ -1,334 +1,233 @@
1
- import subprocess
 
2
  import gradio as gr
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- NUMERIC_INTERVALS,
22
- TYPES,
23
- AutoEvalColumn,
24
- ModelType,
25
- fields,
26
- WeightType,
27
- Precision
28
- )
29
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
30
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
31
- from src.submission.submit import add_new_eval
32
-
33
-
34
- def restart_space():
35
- API.restart_space(repo_id=REPO_ID)
36
-
37
- try:
38
- print(EVAL_REQUESTS_PATH)
39
- snapshot_download(
40
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
41
- )
42
- except Exception:
43
- restart_space()
44
- try:
45
- print(EVAL_RESULTS_PATH)
46
- snapshot_download(
47
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
48
- )
49
- except Exception:
50
- restart_space()
51
-
52
-
53
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
54
- leaderboard_df = original_df.copy()
55
-
56
- (
57
- finished_eval_queue_df,
58
- running_eval_queue_df,
59
- pending_eval_queue_df,
60
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
61
-
62
-
63
- # Searching and filtering
64
- def update_table(
65
- hidden_df: pd.DataFrame,
66
- columns: list,
67
- type_query: list,
68
- precision_query: str,
69
- size_query: list,
70
- show_deleted: bool,
71
- query: str,
72
  ):
73
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
74
- filtered_df = filter_queries(query, filtered_df)
75
- df = select_columns(filtered_df, columns)
 
76
  return df
77
 
78
 
79
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
80
- return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
81
-
82
-
83
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
84
- always_here_cols = [
85
- AutoEvalColumn.model_type_symbol.name,
86
- AutoEvalColumn.model.name,
87
- ]
88
- # We use COLS to maintain sorting
89
- filtered_df = df[
90
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
91
- ]
92
- return filtered_df
93
 
94
 
95
- def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
96
- final_df = []
97
- if query != "":
98
- queries = [q.strip() for q in query.split(";")]
99
- for _q in queries:
100
- _q = _q.strip()
101
- if _q != "":
102
- temp_filtered_df = search_table(filtered_df, _q)
103
- if len(temp_filtered_df) > 0:
104
- final_df.append(temp_filtered_df)
105
- if len(final_df) > 0:
106
- filtered_df = pd.concat(final_df)
107
- filtered_df = filtered_df.drop_duplicates(
108
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
109
  )
110
 
111
- return filtered_df
112
-
113
-
114
- def filter_models(
115
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
116
- ) -> pd.DataFrame:
117
- # Show all models
118
- if show_deleted:
119
- filtered_df = df
120
- else: # Show only still on the hub models
121
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
122
-
123
- type_emoji = [t[0] for t in type_query]
124
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
125
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
126
-
127
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
128
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
129
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
130
- filtered_df = filtered_df.loc[mask]
131
-
132
- return filtered_df
133
-
134
-
135
- demo = gr.Blocks(css=custom_css)
136
- with demo:
137
- gr.HTML(TITLE)
138
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
139
-
140
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
141
- with gr.TabItem("🏅 TempCompass Benchmark", elem_id="llm-benchmark-tab-table", id=0):
142
- with gr.Row():
143
- with gr.Column():
144
- with gr.Row():
145
- search_bar = gr.Textbox(
146
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
147
- show_label=False,
148
- elem_id="search-bar",
149
- )
150
- with gr.Row():
151
- shown_columns = gr.CheckboxGroup(
152
- choices=[
153
- c.name
154
- for c in fields(AutoEvalColumn)
155
- if not c.hidden and not c.never_hidden
156
- ],
157
- value=[
158
- c.name
159
- for c in fields(AutoEvalColumn)
160
- if c.displayed_by_default and not c.hidden and not c.never_hidden
161
- ],
162
- label="Select columns to show",
163
- elem_id="column-select",
164
- interactive=True,
165
- )
166
- with gr.Row():
167
- deleted_models_visibility = gr.Checkbox(
168
- value=False, label="Show gated/private/deleted models", interactive=True
169
- )
170
- with gr.Column(min_width=320):
171
- #with gr.Box(elem_id="box-filter"):
172
- filter_columns_type = gr.CheckboxGroup(
173
- label="Model types",
174
- choices=[t.to_str() for t in ModelType],
175
- value=[t.to_str() for t in ModelType],
176
- interactive=True,
177
- elem_id="filter-columns-type",
178
- )
179
- filter_columns_precision = gr.CheckboxGroup(
180
- label="Precision",
181
- choices=[i.value.name for i in Precision],
182
- value=[i.value.name for i in Precision],
183
- interactive=True,
184
- elem_id="filter-columns-precision",
185
- )
186
- filter_columns_size = gr.CheckboxGroup(
187
- label="Model sizes (in billions of parameters)",
188
- choices=list(NUMERIC_INTERVALS.keys()),
189
- value=list(NUMERIC_INTERVALS.keys()),
190
- interactive=True,
191
- elem_id="filter-columns-size",
192
- )
193
 
194
- leaderboard_table = gr.components.Dataframe(
195
- value=leaderboard_df[
196
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
197
- + shown_columns.value
198
- ],
199
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
200
- datatype=TYPES,
201
- elem_id="leaderboard-table",
202
  interactive=False,
203
  visible=True,
204
- )
205
-
206
- # Dummy leaderboard for handling the case when the user uses backspace key
207
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
208
- value=original_df[COLS],
209
- headers=COLS,
210
- datatype=TYPES,
211
- visible=False,
212
- )
213
- search_bar.submit(
214
- update_table,
215
- [
216
- hidden_leaderboard_table_for_search,
217
- shown_columns,
218
- filter_columns_type,
219
- filter_columns_precision,
220
- filter_columns_size,
221
- deleted_models_visibility,
222
- search_bar,
223
- ],
224
- leaderboard_table,
225
- )
226
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
227
- selector.change(
228
- update_table,
229
- [
230
- hidden_leaderboard_table_for_search,
231
- shown_columns,
232
- filter_columns_type,
233
- filter_columns_precision,
234
- filter_columns_size,
235
- deleted_models_visibility,
236
- search_bar,
237
- ],
238
- leaderboard_table,
239
- queue=True,
240
  )
241
 
242
- # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
243
- # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
244
-
245
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
246
- with gr.Column():
247
- with gr.Row():
248
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
249
-
250
- with gr.Column():
251
- with gr.Accordion(
252
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
253
- open=False,
254
- ):
255
- with gr.Row():
256
- finished_eval_table = gr.components.Dataframe(
257
- value=finished_eval_queue_df,
258
- headers=EVAL_COLS,
259
- datatype=EVAL_TYPES,
260
- row_count=5,
261
- )
262
- with gr.Accordion(
263
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
264
- open=False,
265
- ):
266
- with gr.Row():
267
- running_eval_table = gr.components.Dataframe(
268
- value=running_eval_queue_df,
269
- headers=EVAL_COLS,
270
- datatype=EVAL_TYPES,
271
- row_count=5,
272
- )
273
 
274
- with gr.Accordion(
275
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
276
- open=False,
277
- ):
278
- with gr.Row():
279
- pending_eval_table = gr.components.Dataframe(
280
- value=pending_eval_queue_df,
281
- headers=EVAL_COLS,
282
- datatype=EVAL_TYPES,
283
- row_count=5,
284
- )
285
  with gr.Row():
286
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
287
 
288
  with gr.Row():
289
  with gr.Column():
290
- model_name_textbox = gr.Textbox(label="Model name")
291
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
292
  model_type = gr.Dropdown(
293
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
294
  label="Model type",
295
  multiselect=False,
296
  value=None,
297
  interactive=True,
298
  )
299
-
300
- with gr.Column():
301
- precision = gr.Dropdown(
302
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
303
- label="Precision",
304
- multiselect=False,
305
- value="float16",
306
- interactive=True,
307
- )
308
- weight_type = gr.Dropdown(
309
- choices=[i.value.name for i in WeightType],
310
- label="Weights type",
311
- multiselect=False,
312
- value="Original",
313
- interactive=True,
314
  )
315
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
316
 
317
- submit_button = gr.Button("Submit Eval")
318
- submission_result = gr.Markdown()
319
- submit_button.click(
320
- add_new_eval,
321
- [
322
- model_name_textbox,
323
- base_model_name_textbox,
324
- revision_name_textbox,
325
- precision,
326
- weight_type,
327
- model_type,
328
- ],
329
- submission_result,
330
- )
331
332
  with gr.Row():
333
  with gr.Accordion("📙 Citation", open=False):
334
  citation_button = gr.Textbox(
@@ -339,7 +238,6 @@ with demo:
339
  show_copy_button=True,
340
  )
341
 
342
- scheduler = BackgroundScheduler()
343
- scheduler.add_job(restart_space, "interval", seconds=1800)
344
- scheduler.start()
345
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
+
3
  import gradio as gr
4
  import pandas as pd
5
+ import re
6
+ import pdb
7
+ import tempfile
8
+
9
+ from constants import *
10
+ from src.compute import compute_scores
11
+
12
+ global data_component, filter_component
13
+
14
+
15
+ def validate_model_size(s):
16
+ pattern = r'^\d+B$|^-$'
17
+ if re.match(pattern, s):
18
+ return s
19
+ else:
20
+ return '-'
21
+
22
+ def upload_file(files):
23
+ file_paths = [file.name for file in files]
24
+ return file_paths
25
+
26
+ def add_new_eval(
27
+ input_file,
28
+ model_name_textbox: str,
29
+ revision_name_textbox: str,
30
+ model_link: str,
31
+ model_type: str,
32
+ model_size: str,
33
  ):
34
+ if input_file is None:
35
+ return "Error! Empty file!"
36
+ else:
37
+
38
+ model_size = validate_model_size(model_size)
39
+
40
+ input_file = compute_scores(input_file)
41
+ input_data = input_file[1]
42
+ input_data = [float(i) for i in input_data]
43
+
44
+ csv_data = pd.read_csv(CSV_DIR)
45
+
46
+ if revision_name_textbox == '':
47
+ col = csv_data.shape[0]
48
+ model_name = model_name_textbox
49
+ name_list = [name.split(']')[0][1:] if name.endswith(')') else name for name in csv_data['Model']]
50
+ print(name_list)
51
+ print(model_name)
52
+ assert model_name not in name_list
53
+ else:
54
+ model_name = revision_name_textbox
55
+ model_name_list = csv_data['Model']
56
+ name_list = [name.split(']')[0][1:] if name.endswith(')') else name for name in model_name_list]
57
+ if revision_name_textbox not in name_list:
58
+ col = csv_data.shape[0]
59
+ else:
60
+ col = name_list.index(revision_name_textbox)
61
+
62
+ if model_link == '':
63
+ model_name = model_name # no url
64
+ else:
65
+ model_name = '[' + model_name + '](' + model_link + ')'
66
+
67
+ # add new data
68
+ new_data = [
69
+ model_name,
70
+ model_type,
71
+ model_size,
72
+ input_data[0],
73
+ input_data[1],
74
+ input_data[2],
75
+ input_data[3],
76
+ input_data[4],
77
+ input_data[5],
78
+ input_data[6],
79
+ input_data[7],
80
+ input_data[8],
81
+ input_data[9],
82
+ input_data[10],
83
+ input_data[11],
84
+ input_data[12],
85
+ input_data[13],
86
+ input_data[14],
87
+ input_data[15],
88
+ input_data[16],
89
+ ]
90
+ csv_data.loc[col] = new_data
91
+ # with open(f'./file/{model_name}.json','w' ,encoding='utf-8') as f:
92
+ # json.dump(new_data, f)
93
+ csv_data.to_csv(CSV_DIR, index=False)
94
+ return 0
95
+
96
+ def get_baseline_df():
97
+ # pdb.set_trace()
98
+ df = pd.read_csv(CSV_DIR)
99
+ df = df.sort_values(by="Avg. All", ascending=False)
100
+ present_columns = MODEL_INFO + checkbox_group.value
101
+ df = df[present_columns]
102
  return df
103
 
104
+ def get_all_df():
105
+ df = pd.read_csv(CSV_DIR)
106
+ df = df.sort_values(by="Avg. All", ascending=False)
107
+ return df
108
 
109
+ block = gr.Blocks()
110
 
111
 
112
+ with block:
113
+ gr.Markdown(
114
+ LEADERBORAD_INTRODUCTION
115
+ )
116
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
117
+ with gr.TabItem("🏅 TempCompass Benchmark", elem_id="video-benchmark-tab-table", id=0):
118
+
119
+ gr.Markdown(
120
+ TABLE_INTRODUCTION
121
  )
122
 
123
+ # selection for column part:
124
+ checkbox_group = gr.CheckboxGroup(
125
+ choices=TASK_INFO_v2,
126
+ value=AVG_INFO,
127
+ label="Select options",
128
+ interactive=True,
129
+ )
130
 
131
+ # Create the dataframe component
132
+ data_component = gr.components.Dataframe(
133
+ value=get_baseline_df,
134
+ headers=COLUMN_NAMES,
135
+ type="pandas",
136
+ datatype=DATA_TITILE_TYPE,
137
  interactive=False,
138
  visible=True,
139
  )
140
+
141
+ def on_checkbox_group_change(selected_columns):
142
+ # pdb.set_trace()
143
+ selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
144
+ present_columns = MODEL_INFO + selected_columns
145
+ updated_data = get_all_df()[present_columns]
146
+ updated_data = updated_data.sort_values(by=present_columns[1], ascending=False)
147
+ updated_headers = present_columns
148
+ update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
149
+
150
+ filter_component = gr.components.Dataframe(
151
+ value=updated_data,
152
+ headers=updated_headers,
153
+ type="pandas",
154
+ datatype=update_datatype,
155
+ interactive=False,
156
+ visible=True,
157
+ )
158
+ # pdb.set_trace()
159
+
160
+ return filter_component.value
161
+
162
+ # Wire the checkbox group to its handler function
163
+ checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
164
+ '''
165
+ # table 2
166
+ with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
167
+ gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
168
+ '''
169
+ # table 3
170
+ with gr.TabItem("🚀 Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
171
+ gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
172
 
173
+ with gr.Row():
174
+ gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
175
 
176
  with gr.Row():
177
+ gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
178
 
179
  with gr.Row():
180
  with gr.Column():
181
+ model_name_textbox = gr.Textbox(
182
+ label="Model name", placeholder="Chat-UniVi-7B"
183
+ )
184
+ revision_name_textbox = gr.Textbox(
185
+ label="Revision Model Name", placeholder="Chat-UniVi-7B"
186
+ )
187
+ model_link = gr.Textbox(
188
+ label="Model Link", placeholder="https://github.com/PKU-YuanGroup/Chat-UniVi"
189
+ )
190
  model_type = gr.Dropdown(
191
+ choices=[
192
+ "LLM",
193
+ "ImageLLM",
194
+ "VideoLLM",
195
+ "Other",
196
+ ],
197
  label="Model type",
198
  multiselect=False,
199
  value=None,
200
  interactive=True,
201
  )
202
+ model_size = gr.Textbox(
203
+ label="Model size", placeholder="7B (format must be 'number+B' or '-'; default is '-')"
204
  )
 
205
 
206
+ with gr.Column():
207
+
208
+ input_file = gr.File(label="Click to Upload a json File", type='binary')
209
+ submit_button = gr.Button("Submit Eval")
210
+
211
+ submission_result = gr.Markdown()
212
+ submit_button.click(
213
+ add_new_eval,
214
+ inputs=[
215
+ input_file,
216
+ model_name_textbox,
217
+ revision_name_textbox,
218
+ model_link,
219
+ model_type,
220
+ model_size,
221
+ ],
222
+ # outputs = submission_result,
223
+ )
224
 
225
+ with gr.Row():
226
+ data_run = gr.Button("Refresh")
227
+ data_run.click(
228
+ get_baseline_df, outputs=data_component
229
+ )
230
+
231
  with gr.Row():
232
  with gr.Accordion("📙 Citation", open=False):
233
  citation_button = gr.Textbox(
 
238
  show_copy_button=True,
239
  )
240
 
241
+ # block.load(get_baseline_df, outputs=data_title)
242
+
243
+ block.launch()
 
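A note on the submission form above: the Model size textbox is normalized by `validate_model_size`, which keeps only values shaped like `7B` (digits followed by an uppercase `B`) or the placeholder `-`. A minimal sketch of that behavior, reusing the regex defined in app.py:

```python
import re

def validate_model_size(s):
    # Same pattern as app.py: digits followed by "B", or a bare "-"
    pattern = r'^\d+B$|^-$'
    return s if re.match(pattern, s) else '-'

print(validate_model_size("7B"))     # "7B"  (accepted)
print(validate_model_size("13B"))    # "13B" (accepted)
print(validate_model_size("7b"))     # "-"   (lowercase b is rejected)
print(validate_model_size("seven"))  # "-"   (falls back to the placeholder)
```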
constants.py ADDED
@@ -0,0 +1,94 @@
1
+ # this is .py for store constants
2
+ MODEL_INFO = ["Model"]
3
+
4
+ TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
5
+ "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
6
+ "TVQA", "MV", "NBA",
7
+ "Driving-exam", "Driving-decision-making", "SQA3D"]
8
+
9
+ AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
10
+ DATA_TITILE_TYPE = ["markdown",
11
+ "number", "number", "number", "number", "number", "number", "number",
12
+ "number", "number", "number",
13
+ "number", "number", "number",
14
+ "number", "number", "number", "number", ]
15
+ CSV_DIR = "./file/result.csv"
16
+
17
+ # COLUMN_NAMES = MODEL_INFO + TASK_INFO
18
+ COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
19
+
20
+ LEADERBORAD_INTRODUCTION = """
21
+ Welcome to the leaderboard of TempCompass! 🏆
22
+
23
+ TempCompass is a benchmark to evaluate the temporal perception ability of Video LLMs. It consists of 410 videos and 7,540 task instructions, covering 11 temporal aspects and 4 task types. Please refer to [our paper](https://arxiv.org/abs/2403.00476) for more details.
24
+ """
25
+
26
+ SUBMIT_INTRODUCTION = """
27
+ # TempCompass Leaderboard
28
+
29
+ Welcome to the leaderboard of TempCompass! 🏆
30
+
31
+ ## Submit Instruction
32
+ Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).
33
+
34
+ You will obtain the JSON file `<task_type>.json`, where `<task_type>` corresponds to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://github.com/llyx97/TempCompass/tree/main/auto_eval_results/video-llava))
35
+
36
+ For `multi-choice`, `yes_no` and `caption_matching`, the evaluation result for each question contains five keys. A specific example is as follows:
37
+ ```python
38
+ {
39
+ "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
40
+ "gt-answer": "D. fighting",
41
+ "video-llm-prediction": "D",
42
+ "match_success": true, # whether the video-llm-prediction can be assessed by rule-based matching
43
+ "rating": 1
44
+ }
45
+ ```
46
+
47
+ For `captioning`, we prompt ChatGPT to answer the multi-choice question, using the caption generated by the Video LLM as context. An example of the evaluation result is as follows:
48
+ ```python
49
+ {
50
+ "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
51
+ "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
52
+ "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
53
+ "gt-answer": "A. dunking a basketball",
54
+ "rating": 0
55
+ }
56
+ ```
57
+
58
+
59
+ ### Submit Example
60
+ For example, if you want to submit Video-LLaVA's result to the leaderboard, you need to:
61
+ 1. Fill in ‘Video-LLaVA’ in ‘Model Name’ if this is the first time you submit this model's result (you can leave ‘Revision Model Name’ blank).
62
+ 2. Fill in ‘Video-LLaVA’ in ‘Revision Model Name’ if you want to update an existing result (you can leave ‘Model Name’ blank).
63
+ 3. Select ‘ImageLLM’ in ‘Model Type’.
64
+ 4. Fill in ‘https://github.com/x/x’ in ‘Model Link’.
65
+ 5. Fill in ‘7B’ in ‘Model size’.
66
+ 6. Upload `<task_type>.json`.
67
+ 7. Click the ‘Submit Eval’ button.
68
+ 8. Click ‘Refresh’ to see the updated leaderboard.
69
+
70
+ """
71
+
72
+ TABLE_INTRODUCTION = """In the table below, we summarize the per-task performance of all models.
73
+ We use accuracy (%) as the primary evaluation metric for each task.
74
+ """
75
+
76
+ LEADERBORAD_INFO = """
77
+ Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
78
+ In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
79
+ SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
80
+ We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
81
+ Multiple-choice questions with ground-truth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
82
+ We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
83
+ By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
84
+ """
85
+
86
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
87
+ CITATION_BUTTON_TEXT = r"""
88
+ @article{liu2024tempcompass,
89
+ title = {TempCompass: Do Video LLMs Really Understand Videos?},
90
+ author = {Yuanxin Liu and Shicheng Li and Yi Liu and Yuxiang Wang and Shuhuai Ren and Lei Li and Sishuo Chen and Xu Sun and Lu Hou},
91
+ year = {2024},
92
+ journal = {arXiv preprint arXiv: 2403.00476}
93
+ }
94
+ """
file/example_eval_results/caption_matching.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/captioning.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/merged_result.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/multi-choice.json ADDED
The diff for this file is too large to render. See raw diff
 
file/example_eval_results/yes_no.json ADDED
The diff for this file is too large to render. See raw diff
 
file/result.csv ADDED
@@ -0,0 +1,13 @@
1
+ Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
2
+ Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
3
+ [VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
4
+ [Video-ChatGPT-7B](https://github.com/mbzuai-oryx/Video-ChatGPT),38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
5
+ [Otter-7B](https://github.com/Luodian/Otter),37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
6
+ [PandaGPT-7B](https://github.com/yxuansu/PandaGPT),37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
7
+ [Valley-7B](https://github.com/RupertLuo/Valley),33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
8
+ [mPLUG-owl-7B](https://github.com/X-PLUG/mPLUG-Owl),33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
9
+ [Video-LLaMA-7B](https://github.com/DAMO-NLP-SG/Video-LLaMA),32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
10
+ [Chat-UniVi-7B](https://github.com/PKU-YuanGroup/Chat-UniVi),35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
11
+ sphinx-v2,45.53190476,44.22571429,41.81666667,50.55333333,0.5307,0.6845,0.5395,0.5341,0.42,0.2759,0.1111,0.3645,0.4396,0.4504,0.4722,0.5564,0.488
12
+ Gemini,49.598478632478624,50.63076923076923,47.93666666666667,50.228,0.585,0.6179,0.4742,0.5305,0.4769,0.5477,0.1176,0.4656,0.5318,0.4407,0.5285,0.4129
13
+ llava_phi_2.7,43.41644444444445,42.97,37.54333333333334,49.736,0.5785,0.608,0.514,0.4542,0.4345,0.1483,0.1111,0.392,0.4763,0.258,0.5538,0.4535
file/result.csv.bak ADDED
@@ -0,0 +1,5 @@
1
+ Model,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
2
+ Random,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
3
+ [VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
4
+ Gemini,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
5
+ llava_phi_2.7,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
merge_eval_result.py ADDED
@@ -0,0 +1,14 @@
1
+ import json, os
2
+
3
+ eval_result_path = "file/example_eval_results"
4
+ eval_result_files = [f for f in os.listdir(eval_result_path) if f.endswith('.json') and f != 'merged_result.json']  # skip the merged output itself on reruns
5
+
6
+ merged_result = {}
7
+ for fn in eval_result_files:
8
+ task_type = fn.replace('.json', '')
9
+ with open(f"{eval_result_path}/{fn}", "r") as f:
10
+ merged_result[task_type] = json.load(f)
11
+
12
+ merge_file = f"{eval_result_path}/merged_result.json"
13
+ with open(merge_file, "w") as f:
14
+ json.dump(merged_result, f, indent=4)
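Running this script over the bundled `file/example_eval_results/` folder produces a `merged_result.json` with one top-level entry per task type. A quick check of that shape (a sketch; it assumes the example files added in this commit are present and the script is run from the Space root):

```python
import json

# merge_eval_result.py writes one key per per-task result file.
with open("file/example_eval_results/merged_result.json", "r") as f:
    merged = json.load(f)

print(sorted(merged.keys()))
# expected: ['caption_matching', 'captioning', 'multi-choice', 'yes_no']
```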
src/compute.py ADDED
@@ -0,0 +1,121 @@
1
+ import json
2
+ import os
3
+ import glob
4
+ import argparse
5
+ import csv
6
+
7
+
8
+ def chatgpt_json(merge_file):
9
+ # Score the uploaded predictions against the ground-truth answers in file/ANSWER.json
10
+ merge_data = merge_file.decode("utf-8")
11
+ merge_data = json.loads(merge_data)  # the uploaded file is JSON; json.loads handles true/false/null, unlike eval
12
+ correct_answer_file = 'file/ANSWER.json'
13
+ with open(correct_answer_file, 'r', encoding='utf-8') as f:
14
+ correct_answer_data = json.load(f)
15
+
16
+ dataset_scores_dict = {}
17
+ for dataset_name, item in merge_data.items():
18
+
19
+ total_nums = len(item)
20
+ correct = 0
21
+ # assert len(item) >= len(correct_answer_data[dataset_name]), f'Video-Bench-Input.json---{dataset_name}---is incomplete!'
22
+ for id, sub_item in item.items():
23
+ if sub_item['output_chatgpt_choice'] == correct_answer_data[dataset_name][id]['answer']:
24
+ correct += 1
25
+
26
+ # dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
27
+ dataset_scores_dict[dataset_name] = round(correct / total_nums , 4)
28
+ return dataset_scores_dict
29
+
30
+
31
+ def compute_scores(merge_file):
32
+ dataset_score_dict = chatgpt_json(merge_file)
33
+ dataset_weight = {
34
+ 1:
35
+ {
36
+ "ActivityNet": 1,
37
+ "MSVD": 1,
38
+ "MSRVTT": 1,
39
+ "TGIF": 1,
40
+ "Youcook2": 1,
41
+ "Ucfcrime": 1,
42
+ "MOT": 0.5,
43
+ },
44
+
45
+ 2:
46
+ {
47
+ "TVQA": 1,
48
+ "MV": 1,
49
+ "NBA": 1,
50
+ },
51
+
52
+ 3:
53
+ {
54
+ "Driving-exam": 0.5,
55
+ "Driving-decision-making": 1,
56
+ "SQA3D": 1,
57
+ }
58
+
59
+ }
60
+
61
+ # Video-exclusive Understanding score
62
+ exclusive_understanding_weight = dataset_weight[1]
63
+ weights_sum = sum(exclusive_understanding_weight.values())
64
+ exclusive_understanding_score = 0
65
+ # import ipdb; ipdb.set_trace()
66
+ for dataset_name, weight in exclusive_understanding_weight.items():
67
+ exclusive_understanding_score += weight * dataset_score_dict[dataset_name] / weights_sum * 100
68
+
69
+ # Prior Knowledge-based Question-answer
70
+ prior_QA_weight = dataset_weight[2]
71
+ weights_sum = sum(prior_QA_weight.values())
72
+ prior_QA_score = 0
73
+ for dataset_name, weight in prior_QA_weight.items():
74
+ prior_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
75
+
76
+ # Comprehension and Decision-making
77
+ com_and_dec_QA_weight = dataset_weight[3]
78
+ weights_sum = sum(com_and_dec_QA_weight.values())
79
+ com_and_dec_QA_score = 0
80
+ for dataset_name, weight in com_and_dec_QA_weight.items():
81
+ com_and_dec_QA_score += weight * dataset_score_dict[dataset_name] / weights_sum *100
82
+
83
+ dataset_score_dict['Exclusive_understanding'] = exclusive_understanding_score
84
+ dataset_score_dict['Prior_Knowledge'] = prior_QA_score
85
+ dataset_score_dict['Comprehension_and_Decision-making'] = com_and_dec_QA_score
86
+
87
+ # final score
88
+ final_score = sum([exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score]) / 3
89
+ dataset_score_dict['final_score'] = final_score
90
+
91
+ # print(dataset_score_dict)
92
+ # with open(args.score_output_file, 'w', encoding='utf-8') as f:
93
+ # json.dump(dataset_score_dict, f, indent=2)
94
+ # print(f'{args.score_output_file} is saved!')
95
+ # ========================
96
+ data = [
97
+
98
+ ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
99
+ "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime",
100
+ "MOT", "TVQA", "MV", "NBA", "Driving-exam", "Driving-decision-making", "SQA3D"],
101
+
102
+ [final_score, exclusive_understanding_score, prior_QA_score, com_and_dec_QA_score,
103
+ dataset_score_dict['ActivityNet'],
104
+ dataset_score_dict["MSVD"],
105
+ dataset_score_dict['MSRVTT'],
106
+ dataset_score_dict['TGIF'],
107
+ dataset_score_dict['Youcook2'],
108
+ dataset_score_dict['Ucfcrime'],
109
+ dataset_score_dict['MOT'],
110
+ dataset_score_dict['TVQA'],
111
+ dataset_score_dict['MV'],
112
+ dataset_score_dict['NBA'],
113
+ dataset_score_dict['Driving-exam'],
114
+ dataset_score_dict['Driving-decision-making'],
115
+ dataset_score_dict['SQA3D'],
116
+ ],
117
+ ]
118
+
119
+
120
+ return data
121
+
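The three category averages above are plain weighted means of per-dataset accuracies, scaled to percent. A standalone sketch of that formula: the weights mirror the `dataset_weight` table in `compute_scores`, and the sample accuracies are taken from the "Random" row of `file/result.csv`.

```python
def weighted_category_score(scores, weights):
    """Weighted mean of per-dataset accuracies (0-1), scaled to percent."""
    total_weight = sum(weights.values())
    return sum(weights[name] * scores[name] for name in weights) / total_weight * 100

# Video-Exclusive Understanding: seven datasets, with MOT weighted 0.5.
weights = {"ActivityNet": 1, "MSVD": 1, "MSRVTT": 1, "TGIF": 1,
           "Youcook2": 1, "Ucfcrime": 1, "MOT": 0.5}
random_scores = {"ActivityNet": 0.3458, "MSVD": 0.26224, "MSRVTT": 0.265,
                 "TGIF": 0.22377, "Youcook2": 0.25, "Ucfcrime": 0.25, "MOT": 0.1667}

print(weighted_category_score(random_scores, weights))
# ~25.8486, matching the "Random" row's "Avg. Video-Exclusive" column in file/result.csv
```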