File size: 7,763 Bytes
cde8217 c88de80 f18e692 c88de80 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Video,Avg. P1,Avg. P2,Avg. P3,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Celebrity Recognition,Landmark Recognition,Chart Understanding,Visual Referring Expression,Science Knowledge,Emotion Recognition,Visual Mathematics,Difference Spotting,Meme Comprehension,Global Video Understanding,Action Recognition,Action Predicion,Procedure Understanding,In-Context Captioning,Interleaved Image-Text Analysis,Text-to-Image Generation,Next Image Prediction,Text-Image Creation
[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,46.8,22.8,36,44.2,37.3,0,58.5,48.6,49,39.1,43.4,36.2,48.5,52.9,60.7,51.8,51.4,19.2,43.2,52.4,29.3,22,17.8,38.6,42.5,37.7,36.2,22.9,40,30.6,0,0,0
[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,52.4,25.8,36.5,48.6,36.1,0,58.9,49.7,61.7,35.1,58.1,34.9,47.4,55.9,61.4,48.5,45.4,26.4,41.7,47.7,34.5,21.2,22.8,35.2,41.5,36.1,40.5,24.5,36.7,34.7,0,0,0
[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,47.5,41.1,33,44.2,28.4,0,53.6,43.9,49,37.8,56.5,35.8,43.3,56.2,57.2,60.3,44.4,27.9,39.2,39.4,23,26.5,36.5,55.4,40.4,38.6,31.2,15.6,26.7,32.7,0,0,0
[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,7B,PPL,42.4,32.5,32.6,40.2,34.3,0,53.8,47.5,38.3,34.2,42,34.7,40.2,52.9,46.4,51.8,45.6,30.3,40.2,37.6,34.3,20.5,27,50,44.1,36.2,25.1,18.6,40,20.4,0,0,0
[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,45,25.7,34.3,42.5,39,0,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,41.8,55.2,45.2,20.2,41.2,43.3,24.2,25,19,46.7,39,38.7,27.4,28.6,45.8,22.5,0,0,0
[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,36.6,29.3,33.2,35.8,21.9,0,46.9,38.6,33.6,35.6,27.5,34.4,33,50.8,47.6,52.4,38.2,30.1,34.7,36.1,31.5,27.3,24.6,44,37.8,38.2,20.9,33.5,19.2,28.6,0,0,0
[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,36.7,44.1,32.6,35.9,36.7,0,46.9,42.5,32,32.3,27.7,29.7,29.9,48.3,35.2,60.9,50.4,24.2,42.2,37.6,32.1,27.3,40.1,56.5,37.6,38.7,25.3,24.4,39.2,30.6,0,0,0
[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,36,32,32.3,35.2,39,0,45.9,39.7,31.9,31.6,26.4,32,33,49.2,39.3,59.7,53,23.6,41.2,36.1,37.3,22,27.4,46.7,36.6,37.9,26,24.8,42.5,30.6,0,0,0
[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,36.6,43.5,32.4,35.8,36.7,0,46.7,42.3,31.7,33.4,27.4,29.8,29.9,47.7,35.6,60.3,49.8,24.2,42.2,39,32.1,27.3,39.9,54.9,37.6,38.4,25.2,24.1,38.3,32.7,0,0,0
[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,36,34.7,31.4,35.1,0,0,45.2,38.5,29.3,33,29.7,35.5,39.2,52,48.7,58.5,46.4,24.2,41.2,40.1,39.7,23.5,29.1,52.2,41.9,38.2,18.8,20.3,0,0,0,0,0
[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,34.9,45.8,31,34.2,40.2,0,41.7,35.5,31.8,29.5,36.2,32,32,51.1,35.2,39.4,36.4,25,36.2,31.1,20.6,22.7,41.5,59.2,40.4,29.7,26.3,24.1,42.5,34.7,0,0,0
[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,38.6,38.7,31.1,36.9,29,0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,49.2,70.9,49.6,23.2,44.2,44,32.5,23.5,33.5,54.9,42,37.8,18.3,19.3,29.2,28.6,0,0,0
[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,1.3B,PPL,52.4,29.4,40.7,49.6,23.7,0,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,68.1,82.1,51.4,21.2,48.2,43.7,30.7,28,25.2,42.8,48.5,40.8,39.5,30,24.2,22.5,0,0,0
[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL,50.3,37.4,34.3,46.6,38.5,0,56.5,47.6,54.8,46.9,54.2,40.3,55.7,55,47.4,62.4,55.6,25.2,43.7,41.2,20.6,28.8,34.3,47.2,39.7,42.8,29.6,19.1,42.5,28.6,0,0,0
[Qwen-VL-plus](https://github.com/QwenLM/Qwen-VL/tree/master?tab=readme-ov-file#qwen-vl-plus),Qwen-LM,-,PPL for A/B/C/D,71.9,70.3,46.7,66.1,29.6,0,76.6,77.7,76.3,65.1,65.8,55.9,73.2,77.9,61.8,97,97.2,39.5,73.4,75.8,51.7,38.6,66.7,81.8,51.8,54.5,29.3,48,28.3,32.7,0,0,0
[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,7B,PPL,58.3,39.2,36.9,53.3,34.4,0,63.7,62.4,66.7,51.3,60.2,38.5,47.4,59.8,69,60.6,49.8,25,45.7,56.7,31.1,24.2,35.7,50.3,46.1,39.4,29.4,28.1,39.2,22.5,0,0,0
[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,PPL,38.8,54.5,32.9,37.5,42.6,0,48.2,38.2,37.8,32.9,29,32.4,37.1,54.1,45.5,52.4,52.8,22.6,42.7,33.2,26.6,21.2,56.5,48.4,42.7,38.6,23.6,20.5,45.8,34.7,0,0,0
[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,65.4,49.8,44.9,60.6,30.2,0,74.8,70.5,67.6,60.5,55.3,53.4,76.3,76.1,61.4,86.1,78,27.2,60.3,84.8,68.9,25.8,47.7,56.6,58.6,49.9,37.6,24.9,27.5,36.7,0,0,0
[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL for A/B/C/D,66.5,50,45.3,61.1,29,0,75.3,71.5,67.6,61.7,55.8,53.9,75.3,76.7,63.7,86.1,78.2,29.7,62.3,84.1,68.5,24.2,47.9,56.6,59.1,50.4,37.8,25.3,28.3,30.6,0,0,0
[SPHINXv1-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,68.5,37.7,40,61.7,32.5,0,75.5,72.4,75.1,63.1,67.6,50.4,64.9,76.7,60,81.5,82.4,21.8,60.3,58.5,65.5,32.6,35.9,43.4,52.4,41.2,33.9,26.1,33.3,30.6,0,0,0
[SPHINXv2-1k](https://github.com/Alpha-VLLM/LLaMA2-Accessory/tree/main/SPHINX),LLaMA-2-13B,13B,Generate,72.1,44.8,44.2,65.4,35.5,0,77.5,78.5,76.6,69,71,57.5,73.2,77.6,62.1,82.7,85.2,44.5,62.3,60.3,65.3,23.5,45.7,42.1,54.6,48.1,37.9,29.9,33.3,40.8,0,0,0
[Emu](https://github.com/baaivision/Emu),LLaMA-13B,13B,PPL,46.4,31.2,37.4,44.2,45.6,45.7,59,50,43.7,37.1,44.3,33.6,49.5,58.3,61.4,68.8,61.6,19,45.7,41.5,24.2,26.4,29.3,37.1,41.9,42.7,37.9,21.8,51.7,30.6,46.8,43.2,34.2
[Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,7B,PPL,31,27.8,30.7,31,40.3,42.8,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31,30.9,27.4,21.2,34.2,31.8,24.4,17.4,24.2,39,35.5,33.8,25.6,24.5,46.7,24.5,45.1,19.8,36.7
[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,49.9,32.4,39,47.3,48,50.6,64,55,51.3,45.4,43.3,37.9,56.7,59.2,57,55.5,52.8,18.8,49.3,44.8,28.8,24.4,29.5,41.5,46.7,39.4,43.9,20.3,54.2,32.7,50.2,40.7,65.8
[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,69.8,73.1,61.7,68.1,37.9,0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0,0,0
[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,36.7,35.4,34.2,36.2,37.3,0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34,30.6,27.4,40,30.6,0,0,0
[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,38.3,49.8,31.6,36.9,33.7,0,44.1,37,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22,33.2,37.2,22.4,25,46.1,61.4,42.6,32.2,27,19,37.5,24.5,0,0,0
[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52,35.2,44.9,43.4,23.8,33.2,37.2,26,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0,0,0
VisionLLaMA,LLaMA-13B,13B,Generate,66.1,61.7,49.4,62.1,0,0,74.4,70.8,68.4,58.9,58.2,50.5,69.1,79.2,59.8,83.6,86,32.1,62.8,55.6,62.9,30.3,63.5,56,58.3,49.2,43.8,42.3,0,0,0,0,0
MindAD,GPT,-,Generate,72.4,0,0,0,0,0,80,76.8,73.9,70.5,67.6,60.5,73.7,80.9,63.1,89.8,86.9,34.2,61.3,60.9,71,40.3,0,0,0,0,0,0,0,0,0,0,0
[Claude-3-Opus](https://www.anthropic.com/news/claude-3-family),\-,-,Generate,42,64.2,40.2,42.2,25.4,0,51.2 ,42.5 ,39.6 ,39.9 ,35.3 ,40.9 ,42.3 ,58.0 ,33.3 ,48.2 ,55.2 ,24.8 ,34.2 ,45.9 ,43.1 ,37.9 ,62.1 ,71.1 ,48.8 ,37.1 ,32.1 ,41.3 ,20.8 ,36.7 ,0.0 ,0.0 ,0.0
[Gemini-Pro-Vision](https://gemini.google.com/),\-,-,Generate,62.9,0,0,0,0,0,73.4,70.0,63.6,54.2,47.8,46.6,70.2,77.4,50.6,92.8,86.2,37.0,42.9,77.3,66.7,34.2,0.0,0.0,0.0,50.7,0.0,41.9,0.0,0.0,0.0,0.0,0.0
|