|
|
|
MODEL_INFO = ["Model Type", "Model", "Language Model"] |
|
TASK_INFO = ["Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Avg. Img", "Action Recognition", "Action Prediction", "Procedure Understanding", "Avg. Video", "Avg. All"] |
|
AVG_INFO = ["Avg. Img", "Avg. Video", "Avg. All"] |
|
DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"] |
|
CSV_DIR = "./file/result.csv" |
|
|
|
COLUMN_NAMES = MODEL_INFO + TASK_INFO |
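# Hedged sketch (not part of the original app logic): assuming the CSV at
# CSV_DIR stores one row per model with exactly the COLUMN_NAMES columns,
# the leaderboard table could be loaded like this. The helper name
# `load_leaderboard` is hypothetical; pandas is imported inside the function
# so this constants module stays dependency-free at import time.
def load_leaderboard(csv_path=CSV_DIR):
    import pandas as pd

    df = pd.read_csv(csv_path)
    df = df[COLUMN_NAMES]  # keep only the displayed columns
    return df.sort_values("Avg. All", ascending=False)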
|
DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192] |
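# Hedged sketch: DATA_NUM lists the number of questions per evaluation
# dimension (9 image dimensions followed by 3 video dimensions, matching the
# non-average entries of TASK_INFO). The helper below is hypothetical and only
# illustrates how sample-weighted "Avg. Img", "Avg. Video" and "Avg. All"
# columns could be derived from 12 per-dimension accuracies; that the
# leaderboard weights by DATA_NUM is an assumption, not confirmed by this file.
def weighted_averages(scores):
    def wavg(vals, weights):
        return sum(v * w for v, w in zip(vals, weights)) / sum(weights)

    avg_img = wavg(scores[:9], DATA_NUM[:9])    # image dimensions
    avg_video = wavg(scores[9:], DATA_NUM[9:])  # video dimensions
    avg_all = wavg(scores, DATA_NUM)            # all 12 dimensions
    return round(avg_img, 1), round(avg_video, 1), round(avg_all, 1)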
|
|
|
UNTUNED_MODEL_RESULTS = '''LLM & Flan-T5 & Flan-T5-XL &23.0 &29.0 &32.8 &31.8 &20.5 &31.8 &33.0 &18.2 &19.4 &23.2 &34.9 &25.4 \\ |
|
LLM & Vicuna & Vicuna-7B &23.4 &30.7 &29.7 &30.9 &30.8 &28.6 &29.8 &18.5 &13.4 &27.3 &34.5 &23.8 \\ |
|
LLM & LLaMA & LLaMA-7B &26.3 &27.4 &26.2 &28.3 &25.1 &28.8 &19.2 &37.0 & 9.0 &33.0 &23.1 &26.2 \\ |
|
ImageLLM & BLIP-2 & Flan-T5-XL &59.1 &53.9 &49.2 &42.3 &43.2 &36.7 &55.7 &45.6 &25.9 &32.6 &47.5 &24.0 \\ |
|
ImageLLM & InstructBLIP & Flan-T5-XL &60.3 &58.5 &63.4 &40.6 &58.4 &38.7 &51.6 &45.9 &25.9 &33.1 &49.1 &27.1 \\ |
|
ImageLLM & InstructBLIP-Vicuna & Vicuna-7B &60.2 &58.9 &65.6 &43.6 &57.2 &40.3 &52.6 &47.7 &43.5 &34.5 &49.6 &23.1 \\ |
|
ImageLLM & LLaVA & LLaMA-7B &42.7 &34.9 &33.5 &28.4 &41.9 &30.8 &27.8 &46.8 &27.7 &29.7 &21.4 &19.1 \\ |
|
ImageLLM & MiniGPT-4 & Flan-T5-XL &56.3 &49.2 &45.8 &37.9 &45.3 &32.6 &47.4 &57.1 &11.8 &38.2 &24.5 &27.1 \\ |
|
ImageLLM & VPGTrans & LLaMA-7B &51.9 &44.1 &39.9 &36.1 &33.7 &36.4 &32.0 &53.2 &30.6 &39.5 &24.3 &31.9 \\ |
|
ImageLLM & MultiModal-GPT & LLaMA-7B &43.6 &37.9 &31.5 &30.8 &27.3 &30.1 &29.9 &51.4 &18.8 &36.9 &25.8 &24.0 \\ |
|
ImageLLM & Otter & LLaMA-7B &44.9 &38.6 &32.2 &30.9 &26.3 &31.8 &32.0 &51.4 &31.8 &37.9 &27.2 &24.8 \\ |
|
ImageLLM & OpenFlamingo & LLaMA-7B &43.9 &38.1 &31.3 &30.1 &27.3 &30.6 &29.9 &50.2 &20.0 &37.2 &25.4 &24.2 \\ |
|
ImageLLM & LLaMA-Adapter V2 & LLaMA-7B &45.2 &38.5 &29.3 &33.0 &29.7 &35.5 &39.2 &52.0 &24.7 &38.6 &18.5 &19.6 \\ |
|
ImageLLM & GVT & Vicuna-7B &41.7 &35.5 &31.8 &29.5 &36.2 &32.0 &32.0 &51.1 &27.1 &33.9 &25.4 &23.0 \\ |
|
ImageLLM & mPLUG-Owl & LLaMA-7B &49.7 &45.3 &32.5 &36.7 &27.3 &32.7 &44.3 &54.7 &28.8 &26.7 &17.9 &26.5 \\ |
|
VideoLLM & VideoChat & Vicuna-7B &47.1 &43.8 &34.9 &40.0 &32.8 &34.6 &42.3 &50.5 &17.7 &34.9 &36.4 &27.3 \\ |
|
VideoLLM & Video-ChatGPT & LLaMA-7B &37.2 &31.4 &33.2 &28.4 &35.5 &29.5 &23.7 &42.3 &25.9 &27.6 &21.3 &21.1 \\ |
|
VideoLLM & Valley & LLaMA-13B &39.3 &32.9 &31.6 &27.9 &24.2 &30.1 &27.8 &43.8 &11.8 &31.3 &23.2 &20.7 \\''' |
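# Hedged sketch: each row of UNTUNED_MODEL_RESULTS is a LaTeX-style table row
# ("Model Type & Model & Language Model" followed by 12 per-dimension
# accuracies, with '&' separators and a trailing row terminator). The parser
# below is hypothetical and only shows one way to turn the string into rows.
def parse_untuned_results(raw=UNTUNED_MODEL_RESULTS):
    rows = []
    for line in raw.splitlines():
        line = line.strip().rstrip("\\").strip()  # drop the LaTeX row terminator
        if not line:
            continue
        cells = [c.strip() for c in line.split("&")]
        model_type, model, language_model = cells[:3]
        scores = [float(c) for c in cells[3:]]
        rows.append([model_type, model, language_model] + scores)
    return rows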
|
|
|
|
|
LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard |
|
|
|
Welcome to the leaderboard of SEED-Bench!
|
This is a community leaderboard where participants evaluate their multimodal large language models on SEED-Bench across both image and video tasks!
|
Please refer to [our paper](https://arxiv.org/abs/2307.16125) for more details. |
|
""" |
|
|
|
SUBMIT_INTRODUCTION = """# Submit Precautions |
|
1. Obtain the JSON file from our [GitHub repository](https://github.com/AILab-CVC/SEED-Bench).
|
2. If you want to revise a model's results, please ensure that 'Revision Model Name' matches the model name already shown in the leaderboard.
|
3. Please provide the correct link for each submission, so that everyone can reach the model's repository through its name in the leaderboard.
|
4. If you do not evaluate all dimensions, the unevaluated dimensions and their corresponding average scores will be set to 0.
|
""" |
|
|
|
TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models. |
|
We use accuracy (%) as the primary evaluation metric for each task.
|
""" |
|
|
|
LEADERBORAD_INFO = """ |
|
Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation. |
|
In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench. |
|
SEED-Bench consists of 19K multiple-choice questions with accurate human annotations (6x larger than existing benchmarks), which span 12 evaluation dimensions covering the comprehension of both image and video modalities.
|
We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes. |
|
Multiple-choice questions with ground-truth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
|
We further evaluate the performance of 18 models across all 12 dimensions, covering both spatial and temporal understanding.
|
By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research. |
|
""" |
|
|
|
|
|
|
|
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" |
|
CITATION_BUTTON_TEXT = r"""@article{li2023seed, |
|
title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension}, |
|
author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying}, |
|
journal={arXiv preprint arXiv:2307.16125}, |
|
year={2023} |
|
}""" |
|
|