% Spaces: Running Running  -- likely web-page extraction residue, not document text; commented out so it is not typeset.
% Main results table (label: exp:main_result).
% Columns: accuracy with/without ties for alignment, safety and artifact,
% followed by three bias metrics (ACC, NDS, GES).
% Requires booktabs (\toprule/\midrule/\bottomrule) and graphicx (\resizebox);
% \algname is expected to be defined elsewhere in the project.
%
% NOTE(review): the caption states that the best value in each column is bolded,
% but three columns contain a non-bold value larger than the bolded one:
%   Safety "Avg w/ tie": BLIP-v2 44.0      vs. bolded PickScore-v1 37.2
%   Bias ACC:            LLaVA-1.5-7b 83.7 vs. bolded GPT-4-vision 79.0
%   Bias NDS:            Aesthetics 85.7   vs. bolded GPT-4o 82.5
% Values and bolding are left untouched here -- confirm which is intended.
\begin{table}[t]
  \centering
  \caption{Evaluation of three types of multimodal judges across four perspectives on the \algname{} dataset. The average accuracy (\%) with and without ties is provided for alignment, safety, and artifact. We evaluate preference biases over three metrics, i.e., accuracy (ACC), normalized dispersion score (NDS), and Gini-based equality score (GES). The best performance across all models is bolded.}
  \setlength{\tabcolsep}{2pt}
  \renewcommand{\arraystretch}{0.9}
  \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{l|cc|cc|cc|ccc}
      \toprule
      & \multicolumn{2}{c}{\textbf{Alignment}} & \multicolumn{2}{c}{\textbf{Safety}} & \multicolumn{2}{c}{\textbf{Artifact}} & \multicolumn{3}{c}{\textbf{Bias}} \\
      & Avg w/ tie & Avg w/o tie & Avg w/ tie & Avg w/o tie & Avg w/ tie & Avg w/o tie & ACC & NDS & GES \\
      \midrule
      CLIP-v1$^\diamondsuit$ & $38.1$ & $59.5$ & $12.7$ & $33.3$ & $34.4$ & $68.4$ & $57.4$ & $76.3$ & $86.9$ \\
      BLIP-v2$^\diamondsuit$ & $17.3$ & $38.8$ & $44.0$ & $65.6$ & $7.5$ & $36.5$ & $68.7$ & $83.7$ & $91.3$ \\
      PickScore-v1$^\diamondsuit$ & $58.8$ & $64.6$ & \textbf{37.2} & $42.2$ & $83.8$ & $89.6$ & $31.0$ & $66.5$ & $81.1$ \\
      HPS-v2.1$^\diamondsuit$ & $47.3$ & \textbf{70.1} & $18.8$ & $41.3$ & $67.3$ & $93.5$ & $55.0$ & $77.9$ & $87.6$ \\
      ImageReward$^\diamondsuit$ & $50.9$ & $64.7$ & $24.9$ & $38.7$ & $63.5$ & $81.8$ & $40.9$ & $73.7$ & $85.3$ \\
      Aesthetics$^\diamondsuit$ & $32.4$ & $52.7$ & $27.0$ & $53.6$ & $69.6$ & $92.5$ & $61.4$ & $85.7$ & $92.1$ \\
      \midrule
      LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & $24.8$ & $50.2$ & $12.4$ & $51.6$ & 83.7 & 70.4 & 88.7 \\
      LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & $30.7$ & $60.7$ & $23.3$ & $61.2$ & 69.7 & 74.3 & 88.6 \\
      LLaVA-1.6-mistral-7b$^\heartsuit$ & $31.3$ & $62.7$ & $15.2$ & $40.9$ & $45.8$ & $73.2$ & 69.9 & 64.3 & 85.4 \\
      LLaVA-1.6-vicuna-13b$^\heartsuit$ & $29.1$ & $60.3$ & $27.9$ & $45.6$ & $36.8$ & $62.5$ & 56.3 & 64.0 & 82.7 \\
      Instructblip-7b$^\heartsuit$ & $17.1$ & $49.8$ & $26.4$ & $46.9$ & $25.2$ & $64.1$ & 53.1 & 80.8 & 91.2 \\
      MiniGPT4-v2$^\heartsuit$ & $32.8$ & $51.2$ & $25.7$ & $60.1$ & $36.7$ & $47.8$ & 32.6 & 67.0 & 83.3 \\
      Prometheus-Vision-7b$^\heartsuit$ & $18.8$ & $63.9$ & $7.1$ & $58.8$ & $23.4$ & $67.7$ & 49.5 & 43.4 & 74.4 \\
      Prometheus-Vision-13b$^\heartsuit$ & $11.8$ & $64.3$ & $3.6$ & $71.4$ & $8.7$ & $67.9$ & 66.3 & 46.3 & 76.8 \\
      % The three commented-out rows below appear to be earlier values for the
      % same models as the active rows that follow; kept for reference.
      % Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & $6.8$ & $7.1$ & $5.7$ & $7.1$ & 71.9 & 62.8 & 86.2 \\
      % Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & $5.9$ & $6.0$ & $91.8$ & $92.7$ & 25.4 & 69.6 & 84.3 \\
      % Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.7$ & $52.0$ & $49.0$ & $74.7$ & 42.1 & 58.7 & 79.4 \\
      Qwen-VL-Chat$^\spadesuit$ & $52.1$ & $31.6$ & $26.8$ & $7.1$ & $23.6$ & $24.6$ & 71.9 & 62.8 & 86.2 \\
      Internvl-chat-v1-5$^\spadesuit$ & $55.3$ & $67.6$ & $6.3$ & $60.0$ & $66.3$ & $65.1$ & 25.4 & 69.6 & 84.3 \\
      Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.6$ & $52.0$ & $46.1$ & $68.9$ & 42.1 & 58.7 & 79.4 \\
      \midrule
      GPT-4-vision$^\clubsuit$ & $66.1$ & $67.0$ & $26.5$ & $97.6$ & $90.4$ & $96.5$ & \textbf{79.0} & 80.4 & \textbf{93.2} \\
      GPT-4o$^\clubsuit$ & $61.5$ & $62.5$ & $35.3$ & \textbf{100.0} & \textbf{97.6} & \textbf{98.7} & 65.8 & \textbf{82.5} & 92.8 \\
      Gemini Ultra$^\clubsuit$ & \textbf{67.2} & $69.0$ & $13.1$ & $95.1$ & $55.7$ & $96.7$ & 55.6 & 75.3 & 88.6 \\
      Claude 3 Opus$^\clubsuit$ & $57.1$ & $55.9$ & $13.4$ & $78.9$ & $11.9$ & $70.4$ & 57.7 & 65.6 & 85.0 \\
      % Random-baseline row, currently disabled.
      % \midrule
      % Random & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 50.0 \\
      \bottomrule
    \end{tabular}%
  }%
  % Spacing tweak placed after \resizebox: inside the box argument the content
  % is set as a horizontal box, where \vspace cannot adjust the page layout.
  \vspace{-0.2cm}
  \label{exp:main_result}
\end{table}