Upload 4 files
- file/result.csv +2 -0
- file/result_task.csv +2 -0
- file/result_v2.csv +31 -29
- file/result_v2_task.csv +31 -29
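For quick verification of the commit, a minimal sketch (assuming pandas is available, the files sit under file/ as listed above, and the column names follow the result_v2.csv header shown in the diff below) of loading one updated leaderboard and pulling out the two newly added rows:

```python
import pandas as pd

# Load the updated leaderboard CSV (path assumed relative to the repo root).
df = pd.read_csv("file/result_v2.csv")

# This commit appends two closed-source models; the Model column holds
# markdown-style links, so a substring match on the model names suffices.
new_rows = df[df["Model"].str.contains("Claude-3-Opus|Gemini-Pro-Vision", na=False)]

# Show the overall single-image average for the new entries.
print(new_rows[["Model", "Evaluation Method", "Avg. Single"]])
```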
file/result.csv
CHANGED
@@ -49,3 +49,5 @@ Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,49
 ImageLLM,[Weitu-VL-1.0](https://weitu.ai/),Weitu-13B,15B,Generate,69.2,74.2,50.5,78.2,77.2,76.9,67.6,68.7,57.8,74.2,78.2,53.5,55.7,43.4,51.2
 ImageLLM,[LLaVA-7B + detection and grounding trained](https://llava-vl.github.io),Vicuna-7B,7B,Generate,0,67.3,0,74.5,70.9,69,62.3,57.4,51.9,69.1,77,41.9,0,0,0
 ImageLLM,[MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2),MiniCPM-2B,2.8B,Generate,0,67.1,0,75.4,69,68.6,57.2,61.3,45.2,70.1,78.2,42.4,0,0,0
+ImageLLM,[Claude-3-Opus](https://www.anthropic.com/news/claude-3-family),\-,-,Generate,40.9,42.1,36.6,51.2,42.5,39.6,39.9,35.3,40.9,42.3,58.0,33.3,37.1,32.1,41.3
+ImageLLM,[Gemini-Pro-Vision](https://gemini.google.com/),\-,-,Generate,0,62.4,0,73.4,70.0,63.6,54.2,47.8,46.6,70.2,77.4,50.6,50.7,0.0,40.9
file/result_task.csv
CHANGED
@@ -49,3 +49,5 @@ Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,46
 ImageLLM,[Weitu-VL-1.0](https://weitu.ai/),Weitu-13B,15B,Generate,65.2,70.3,50.1,78.2,77.2,76.9,67.6,68.7,57.8,74.2,78.2,53.5,55.7,43.4,51.2
 ImageLLM,[LLaVA-7B + detection and grounding trained](https://llava-vl.github.io),Vicuna-7B,7B,Generate,0,63.8,0,74.5,70.9,69,62.3,57.4,51.9,69.1,77,41.9,0,0,0
 ImageLLM,[MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2),MiniCPM-2B,2.8B,Generate,0,63,0,75.4,69,68.6,57.2,61.3,45.2,70.1,78.2,42.4,0,0,0
+ImageLLM,[Claude-3-Opus](https://www.anthropic.com/news/claude-3-family),\-,-,Generate,41.1,42.6,36.8,51.2,42.5,39.6,39.9,35.3,40.9,42.3,58.0,33.3,37.1,32.1,41.3
+ImageLLM,[Gemini-Pro-Vision](https://gemini.google.com/),\-,-,Generate,0,61.5,0,73.4,70.0,63.6,54.2,47.8,46.6,70.2,77.4,50.6,50.7,0.0,40.9
file/result_v2.csv
CHANGED
@@ -1,31 +1,33 @@
 Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Video,Avg. P1,Avg. P2,Avg. P3,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Celebrity Recognition,Landmark Recognition,Chart Understanding,Visual Referring Expression,Science Knowledge,Emotion Recognition,Visual Mathematics,Difference Spotting,Meme Comprehension,Global Video Understanding,Action Recognition,Action Predicion,Procedure Understanding,In-Context Captioning,Interleaved Image-Text Analysis,Text-to-Image Generation,Next Image Prediction,Text-Image Creation
-[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,46.8,22.8,36
-[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,52.4,25.8,36.5,48.6,36.1,0
-[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,47.5,41.1,33
-[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,7B,PPL,42.4,32.5,32.6,40.2,34.3,0
-[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,45
-[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,36.6,29.3,33.2,35.8,21.9,0
-[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,36.7,44.1,32.6,35.9,36.7,0
-[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,36
-[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,36.6,43.5,32.4,35.8,36.7,0
-[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,36
-[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,34.9,45.8,31
-[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,38.6,38.7,31.1,36.9,29
-[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,1.3B,PPL,52.4,29.4,40.7,49.6,23.7,0
-[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL,50.3,37.4,34.3,46.6,38.5,0
-[Qwen-VL-plus](https://github.com/QwenLM/Qwen-VL/tree/master?tab=readme-ov-file#qwen-vl-plus),Qwen-LM,-,PPL for A/B/C/D,71.9,70.3,46.7,66.1,29.6,0
-[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,7B,PPL,58.3,39.2,36.9,53.3,34.4,0
-[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,PPL,38.8,54.5,32.9,37.5,42.6,0
-[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,65.4,49.8,44.9,60.6,30.2,0
+[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,46.8,22.8,36,44.2,37.3,0,58.5,48.6,49,39.1,43.4,36.2,48.5,52.9,60.7,51.8,51.4,19.2,43.2,52.4,29.3,22,17.8,38.6,42.5,37.7,36.2,22.9,40,30.6,0,0,0
+[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,52.4,25.8,36.5,48.6,36.1,0,58.9,49.7,61.7,35.1,58.1,34.9,47.4,55.9,61.4,48.5,45.4,26.4,41.7,47.7,34.5,21.2,22.8,35.2,41.5,36.1,40.5,24.5,36.7,34.7,0,0,0
+[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,47.5,41.1,33,44.2,28.4,0,53.6,43.9,49,37.8,56.5,35.8,43.3,56.2,57.2,60.3,44.4,27.9,39.2,39.4,23,26.5,36.5,55.4,40.4,38.6,31.2,15.6,26.7,32.7,0,0,0
+[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,7B,PPL,42.4,32.5,32.6,40.2,34.3,0,53.8,47.5,38.3,34.2,42,34.7,40.2,52.9,46.4,51.8,45.6,30.3,40.2,37.6,34.3,20.5,27,50,44.1,36.2,25.1,18.6,40,20.4,0,0,0
+[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,45,25.7,34.3,42.5,39,0,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,41.8,55.2,45.2,20.2,41.2,43.3,24.2,25,19,46.7,39,38.7,27.4,28.6,45.8,22.5,0,0,0
+[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,36.6,29.3,33.2,35.8,21.9,0,46.9,38.6,33.6,35.6,27.5,34.4,33,50.8,47.6,52.4,38.2,30.1,34.7,36.1,31.5,27.3,24.6,44,37.8,38.2,20.9,33.5,19.2,28.6,0,0,0
+[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,36.7,44.1,32.6,35.9,36.7,0,46.9,42.5,32,32.3,27.7,29.7,29.9,48.3,35.2,60.9,50.4,24.2,42.2,37.6,32.1,27.3,40.1,56.5,37.6,38.7,25.3,24.4,39.2,30.6,0,0,0
+[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,36,32,32.3,35.2,39,0,45.9,39.7,31.9,31.6,26.4,32,33,49.2,39.3,59.7,53,23.6,41.2,36.1,37.3,22,27.4,46.7,36.6,37.9,26,24.8,42.5,30.6,0,0,0
+[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,36.6,43.5,32.4,35.8,36.7,0,46.7,42.3,31.7,33.4,27.4,29.8,29.9,47.7,35.6,60.3,49.8,24.2,42.2,39,32.1,27.3,39.9,54.9,37.6,38.4,25.2,24.1,38.3,32.7,0,0,0
+[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,36,34.7,31.4,35.1,0,0,45.2,38.5,29.3,33,29.7,35.5,39.2,52,48.7,58.5,46.4,24.2,41.2,40.1,39.7,23.5,29.1,52.2,41.9,38.2,18.8,20.3,0,0,0,0,0
+[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,34.9,45.8,31,34.2,40.2,0,41.7,35.5,31.8,29.5,36.2,32,32,51.1,35.2,39.4,36.4,25,36.2,31.1,20.6,22.7,41.5,59.2,40.4,29.7,26.3,24.1,42.5,34.7,0,0,0
+[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,38.6,38.7,31.1,36.9,29,0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,49.2,70.9,49.6,23.2,44.2,44,32.5,23.5,33.5,54.9,42,37.8,18.3,19.3,29.2,28.6,0,0,0
+[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,1.3B,PPL,52.4,29.4,40.7,49.6,23.7,0,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,68.1,82.1,51.4,21.2,48.2,43.7,30.7,28,25.2,42.8,48.5,40.8,39.5,30,24.2,22.5,0,0,0
+[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL,50.3,37.4,34.3,46.6,38.5,0,56.5,47.6,54.8,46.9,54.2,40.3,55.7,55,47.4,62.4,55.6,25.2,43.7,41.2,20.6,28.8,34.3,47.2,39.7,42.8,29.6,19.1,42.5,28.6,0,0,0
+[Qwen-VL-plus](https://github.com/QwenLM/Qwen-VL/tree/master?tab=readme-ov-file#qwen-vl-plus),Qwen-LM,-,PPL for A/B/C/D,71.9,70.3,46.7,66.1,29.6,0,76.6,77.7,76.3,65.1,65.8,55.9,73.2,77.9,61.8,97,97.2,39.5,73.4,75.8,51.7,38.6,66.7,81.8,51.8,54.5,29.3,48,28.3,32.7,0,0,0
+[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,7B,PPL,58.3,39.2,36.9,53.3,34.4,0,63.7,62.4,66.7,51.3,60.2,38.5,47.4,59.8,69,60.6,49.8,25,45.7,56.7,31.1,24.2,35.7,50.3,46.1,39.4,29.4,28.1,39.2,22.5,0,0,0
+[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,PPL,38.8,54.5,32.9,37.5,42.6,0,48.2,38.2,37.8,32.9,29,32.4,37.1,54.1,45.5,52.4,52.8,22.6,42.7,33.2,26.6,21.2,56.5,48.4,42.7,38.6,23.6,20.5,45.8,34.7,0,0,0
+[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,65.4,49.8,44.9,60.6,30.2,0,74.8,70.5,67.6,60.5,55.3,53.4,76.3,76.1,61.4,86.1,78,27.2,60.3,84.8,68.9,25.8,47.7,56.6,58.6,49.9,37.6,24.9,27.5,36.7,0,0,0
 [InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL for A/B/C/D,66.5,50,45.3,61.1,29,0,75.3,71.5,67.6,61.7,55.8,53.9,75.3,76.7,63.7,86.1,78.2,29.7,62.3,84.1,68.5,24.2,47.9,56.6,59.1,50.4,37.8,25.3,28.3,30.6,0,0,0
-[SPHINXv1-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,68.5,37.7,40
-[SPHINXv2-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,72.1,44.8,44.2,65.4,35.5,0
-[Emu](https://github.com/baaivision/Emu),LLaMA-13B,13B,PPL,46.4,31.2,37.4,44.2,45.6,45.7,59
-[Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,7B,PPL,31
-[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,49.9,32.4,39
-[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,69.8,73.1,61.7,68.1,37.9,0
-[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,36.7,35.4,34.2,36.2,37.3,0
-[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,38.3,49.8,31.6,36.9,33.7,0
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0
-VisionLLaMA,LLaMA-13B,13B,Generate,66.1,61.7,49.4,62.1,0
-MindAD,GPT,-,Generate,72.4,0,0,0,0,0,80,76.8,73.9,70.5,67.6,60.5,73.7,80.9,63.1,89.8,86.9,34.2,61.3,60.9,71,40.3,0,0,0,0,0,0,0,0,0,0,0
+[SPHINXv1-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,68.5,37.7,40,61.7,32.5,0,75.5,72.4,75.1,63.1,67.6,50.4,64.9,76.7,60,81.5,82.4,21.8,60.3,58.5,65.5,32.6,35.9,43.4,52.4,41.2,33.9,26.1,33.3,30.6,0,0,0
+[SPHINXv2-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,72.1,44.8,44.2,65.4,35.5,0,77.5,78.5,76.6,69,71,57.5,73.2,77.6,62.1,82.7,85.2,44.5,62.3,60.3,65.3,23.5,45.7,42.1,54.6,48.1,37.9,29.9,33.3,40.8,0,0,0
+[Emu](https://github.com/baaivision/Emu),LLaMA-13B,13B,PPL,46.4,31.2,37.4,44.2,45.6,45.7,59,50,43.7,37.1,44.3,33.6,49.5,58.3,61.4,68.8,61.6,19,45.7,41.5,24.2,26.4,29.3,37.1,41.9,42.7,37.9,21.8,51.7,30.6,46.8,43.2,34.2
+[Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,7B,PPL,31,27.8,30.7,31,40.3,42.8,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31,30.9,27.4,21.2,34.2,31.8,24.4,17.4,24.2,39,35.5,33.8,25.6,24.5,46.7,24.5,45.1,19.8,36.7
+[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,49.9,32.4,39,47.3,48,50.6,64,55,51.3,45.4,43.3,37.9,56.7,59.2,57,55.5,52.8,18.8,49.3,44.8,28.8,24.4,29.5,41.5,46.7,39.4,43.9,20.3,54.2,32.7,50.2,40.7,65.8
+[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,69.8,73.1,61.7,68.1,37.9,0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0,0,0
+[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,36.7,35.4,34.2,36.2,37.3,0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34,30.6,27.4,40,30.6,0,0,0
+[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,38.3,49.8,31.6,36.9,33.7,0,44.1,37,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22,33.2,37.2,22.4,25,46.1,61.4,42.6,32.2,27,19,37.5,24.5,0,0,0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52,35.2,44.9,43.4,23.8,33.2,37.2,26,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0,0,0
+VisionLLaMA,LLaMA-13B,13B,Generate,66.1,61.7,49.4,62.1,0,0,74.4,70.8,68.4,58.9,58.2,50.5,69.1,79.2,59.8,83.6,86,32.1,62.8,55.6,62.9,30.3,63.5,56,58.3,49.2,43.8,42.3,0,0,0,0,0
+MindAD,GPT,-,Generate,72.4,0,0,0,0,0,80,76.8,73.9,70.5,67.6,60.5,73.7,80.9,63.1,89.8,86.9,34.2,61.3,60.9,71,40.3,0,0,0,0,0,0,0,0,0,0,0
+[Claude-3-Opus](https://www.anthropic.com/news/claude-3-family),\-,-,Generate,42,64.2,40.2,42.2,25.4,0,51.2,42.5,39.6,39.9,35.3,40.9,42.3,58.0,33.3,48.2,55.2,24.8,34.2,45.9,43.1,37.9,62.1,71.1,48.8,37.1,32.1,41.3,20.8,36.7,0.0,0.0,0.0
+[Gemini-Pro-Vision](https://gemini.google.com/),\-,-,Generate,62.9,0,0,0,0,0,73.4,70.0,63.6,54.2,47.8,46.6,70.2,77.4,50.6,92.8,86.2,37.0,42.9,77.3,66.7,34.2,0.0,0.0,0.0,50.7,0.0,41.9,0.0,0.0,0.0,0.0,0.0
file/result_v2_task.csv
CHANGED
@@ -1,31 +1,33 @@
 Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Video,Avg. P1,Avg. P2,Avg. P3,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Celebrity Recognition,Landmark Recognition,Chart Understanding,Visual Referring Expression,Science Knowledge,Emotion Recognition,Visual Mathematics,Difference Spotting,Meme Comprehension,Global Video Understanding,Action Recognition,Action Predicion,Procedure Understanding,In-Context Captioning,Interleaved Image-Text Analysis,Text-to-Image Generation,Next Image Prediction,Text-Image Creation
-[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,44.1,28.2,34.8,41
-[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,45.5,29
-[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,43.4,46
-[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,7B,PPL,40.6,38.5,31
-[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,41.7,32.9,33.4,39.4,34.1,0
-[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,37.4,34.3,32.6,36.2,23.9,0
-[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,37.5,48.3,31.5,37.4,34.9,0
-[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,37.6,37.1,31.3,36.4,36.6,0
-[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,37.5,47.4,31.3,37.3,35.5,0
-[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,39
-[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,33.5,50.4,30.1,34.4,38.6,0
-[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,41.3,44.2,29.4,39.4,28.9,0
-[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,1.3B,PPL,49.5,34
-[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL,46
-[Qwen-VL-plus](https://github.com/QwenLM/Qwen-VL/tree/master?tab=readme-ov-file#qwen-vl-plus),Qwen-LM,-,PPL for A/B/C/D,69
-[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,7B,PPL,50.8,43
-[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,PPL,37.9,52.5,31.4,38
-[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,64.2,52.2,42.8,59.2,32.1,0
+[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,44.1,28.2,34.8,41,35.3,0,58.5,48.6,49,39.1,43.4,36.2,48.5,52.9,60.7,51.8,51.4,19.2,43.2,52.4,29.3,22,17.8,38.6,42.5,37.7,36.2,22.9,40,30.6,0,0,0
+[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,45.5,29,35.7,42.2,35.7,0,58.9,49.7,61.7,35.1,58.1,34.9,47.4,55.9,61.4,48.5,45.4,26.4,41.7,47.7,34.5,21.2,22.8,35.2,41.5,36.1,40.5,24.5,36.7,34.7,0,0,0
+[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,43.4,46,31.5,41.4,29.7,0,53.6,43.9,49,37.8,56.5,35.8,43.3,56.2,57.2,60.3,44.4,27.9,39.2,39.4,23,26.5,36.5,55.4,40.4,38.6,31.2,15.6,26.7,32.7,0,0,0
+[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,7B,PPL,40.6,38.5,31,38.7,30.2,0,53.8,47.5,38.3,34.2,42,34.7,40.2,52.9,46.4,51.8,45.6,30.3,40.2,37.6,34.3,20.5,27,50,44.1,36.2,25.1,18.6,40,20.4,0,0,0
+[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,41.7,32.9,33.4,39.4,34.1,0,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,41.8,55.2,45.2,20.2,41.2,43.3,24.2,25,19,46.7,39,38.7,27.4,28.6,45.8,22.5,0,0,0
+[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,37.4,34.3,32.6,36.2,23.9,0,46.9,38.6,33.6,35.6,27.5,34.4,33,50.8,47.6,52.4,38.2,30.1,34.7,36.1,31.5,27.3,24.6,44,37.8,38.2,20.9,33.5,19.2,28.6,0,0,0
+[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,37.5,48.3,31.5,37.4,34.9,0,46.9,42.5,32,32.3,27.7,29.7,29.9,48.3,35.2,60.9,50.4,24.2,42.2,37.6,32.1,27.3,40.1,56.5,37.6,38.7,25.3,24.4,39.2,30.6,0,0,0
+[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,37.6,37.1,31.3,36.4,36.6,0,45.9,39.7,31.9,31.6,26.4,32,33,49.2,39.3,59.7,53,23.6,41.2,36.1,37.3,22,27.4,46.7,36.6,37.9,26,24.8,42.5,30.6,0,0,0
+[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,37.5,47.4,31.3,37.3,35.5,0,46.7,42.3,31.7,33.4,27.4,29.8,29.9,47.7,35.6,60.3,49.8,24.2,42.2,39,32.1,27.3,39.9,54.9,37.6,38.4,25.2,24.1,38.3,32.7,0,0,0
+[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,39,40.7,29.8,37.5,0,0,45.2,38.5,29.3,33,29.7,35.5,39.2,52,48.7,58.5,46.4,24.2,41.2,40.1,39.7,23.5,29.1,52.2,41.9,38.2,18.8,20.3,0,0,0,0,0
+[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,33.5,50.4,30.1,34.4,38.6,0,41.7,35.5,31.8,29.5,36.2,32,32,51.1,35.2,39.4,36.4,25,36.2,31.1,20.6,22.7,41.5,59.2,40.4,29.7,26.3,24.1,42.5,34.7,0,0,0
+[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,41.3,44.2,29.4,39.4,28.9,0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,49.2,70.9,49.6,23.2,44.2,44,32.5,23.5,33.5,54.9,42,37.8,18.3,19.3,29.2,28.6,0,0,0
+[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,1.3B,PPL,49.5,34,39.7,46.3,23.3,0,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,68.1,82.1,51.4,21.2,48.2,43.7,30.7,28,25.2,42.8,48.5,40.8,39.5,30,24.2,22.5,0,0,0
+[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL,46,40.8,32.8,43.1,35.5,0,56.5,47.6,54.8,46.9,54.2,40.3,55.7,55,47.4,62.4,55.6,25.2,43.7,41.2,20.6,28.8,34.3,47.2,39.7,42.8,29.6,19.1,42.5,28.6,0,0,0
+[Qwen-VL-plus](https://github.com/QwenLM/Qwen-VL/tree/master?tab=readme-ov-file#qwen-vl-plus),Qwen-LM,-,PPL for A/B/C/D,69,74.3,45.9,65.3,30.5,0,76.6,77.7,76.3,65.1,65.8,55.9,73.2,77.9,61.8,97,97.2,39.5,73.4,75.8,51.7,38.6,66.7,81.8,51.8,54.5,29.3,48,28.3,32.7,0,0,0
+[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,7B,PPL,50.8,43,35.8,47.3,30.8,0,63.7,62.4,66.7,51.3,60.2,38.5,47.4,59.8,69,60.6,49.8,25,45.7,56.7,31.1,24.2,35.7,50.3,46.1,39.4,29.4,28.1,39.2,22.5,0,0,0
+[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,PPL,37.9,52.5,31.4,38,40.3,0,48.2,38.2,37.8,32.9,29,32.4,37.1,54.1,45.5,52.4,52.8,22.6,42.7,33.2,26.6,21.2,56.5,48.4,42.7,38.6,23.6,20.5,45.8,34.7,0,0,0
+[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,64.2,52.2,42.8,59.2,32.1,0,74.8,70.5,67.6,60.5,55.3,53.4,76.3,76.1,61.4,86.1,78,27.2,60.3,84.8,68.9,25.8,47.7,56.6,58.6,49.9,37.6,24.9,27.5,36.7,0,0,0
 [InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL for A/B/C/D,64.6,52.2,42.8,59.6,29.5,0,75.3,71.5,67.6,61.7,55.8,53.9,75.3,76.7,63.7,86.1,78.2,29.7,62.3,84.1,68.5,24.2,47.9,56.6,59.1,50.4,37.8,25.3,28.3,30.6,0,0,0
-[SPHINXv1-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,63
-[SPHINXv2-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,66.7,43.9,42.6,60.2,37
-[Emu](https://github.com/baaivision/Emu),LLaMA-13B,13B,PPL,45.3,33.2,36.1,42.5,41.1,41.4,59
-[Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,7B,PPL,30.8,31.6,29.9,30.7,35.6,33.9,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31
-[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,46.5,35.5,37.6,43.9,43.4,52.3,64
-[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,70
-[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.4,41
-[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,35.8,53.8,30.2,36.4,31
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28
-VisionLLaMA,LLaMA-13B,13B,Generate,62.7,59.8,48.4,59.8,0
-MindAD,GPT,-,Generate,68.2,0,0,0,0,0,80,76.8,73.9,70.5,67.6,60.5,73.7,80.9,63.1,89.8,86.9,34.2,61.3,60.9,71,40.3,0,0,0,0,0,0,0,0,0,0,0
+[SPHINXv1-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,63,39.7,38.4,56.4,31.9,0,75.5,72.4,75.1,63.1,67.6,50.4,64.9,76.7,60,81.5,82.4,21.8,60.3,58.5,65.5,32.6,35.9,43.4,52.4,41.2,33.9,26.1,33.3,30.6,0,0,0
+[SPHINXv2-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,66.7,43.9,42.6,60.2,37,0,77.5,78.5,76.6,69,71,57.5,73.2,77.6,62.1,82.7,85.2,44.5,62.3,60.3,65.3,23.5,45.7,42.1,54.6,48.1,37.9,29.9,33.3,40.8,0,0,0
+[Emu](https://github.com/baaivision/Emu),LLaMA-13B,13B,PPL,45.3,33.2,36.1,42.5,41.1,41.4,59,50,43.7,37.1,44.3,33.6,49.5,58.3,61.4,68.8,61.6,19,45.7,41.5,24.2,26.4,29.3,37.1,41.9,42.7,37.9,21.8,51.7,30.6,46.8,43.2,34.2
+[Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,7B,PPL,30.8,31.6,29.9,30.7,35.6,33.9,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31,30.9,27.4,21.2,34.2,31.8,24.4,17.4,24.2,39,35.5,33.8,25.6,24.5,46.7,24.5,45.1,19.8,36.7
+[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,46.5,35.5,37.6,43.9,43.4,52.3,64,55,51.3,45.4,43.3,37.9,56.7,59.2,57,55.5,52.8,18.8,49.3,44.8,28.8,24.4,29.5,41.5,46.7,39.4,43.9,20.3,54.2,32.7,50.2,40.7,65.8
+[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,70,78.6,61.3,69.2,44.2,0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0,0,0
+[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.4,41,33.4,37,35.3,0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34,30.6,27.4,40,30.6,0,0,0
+[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,35.8,53.8,30.2,36.4,31,0,44.1,37,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22,33.2,37.2,22.4,25,46.1,61.4,42.6,32.2,27,19,37.5,24.5,0,0,0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28,34.5,32.2,0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52,35.2,44.9,43.4,23.8,33.2,37.2,26,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0,0,0
+VisionLLaMA,LLaMA-13B,13B,Generate,62.7,59.8,48.4,59.8,0,0,74.4,70.8,68.4,58.9,58.2,50.5,69.1,79.2,59.8,83.6,86,32.1,62.8,55.6,62.9,30.3,63.5,56,58.3,49.2,43.8,42.3,0,0,0,0,0
+MindAD,GPT,-,Generate,68.2,0,0,0,0,0,80,76.8,73.9,70.5,67.6,60.5,73.7,80.9,63.1,89.8,86.9,34.2,61.3,60.9,71,40.3,0,0,0,0,0,0,0,0,0,0,0
+[Claude-3-Opus](https://www.anthropic.com/news/claude-3-family),\-,-,Generate,42,66.6,39.8,43.8,28.8,0,51.2,42.5,39.6,39.9,35.3,40.9,42.3,58.0,33.3,48.2,55.2,24.8,34.2,45.9,43.1,37.9,62.1,71.1,48.8,37.1,32.1,41.3,20.8,36.7,0.0,0.0,0.0
+[Gemini-Pro-Vision](https://gemini.google.com/),\-,-,Generate,61.9,0,0,0,0,0,73.4,70.0,63.6,54.2,47.8,46.6,70.2,77.4,50.6,92.8,86.2,37.0,42.9,77.3,66.7,34.2,0.0,0.0,0.0,50.7,0.0,41.9,0.0,0.0,0.0,0.0,0.0