\begin{table}[t]
\centering
\caption{Detailed evaluation results of all multimodal judges on the \textbf{safety} perspective. Each judge provides feedback on a numerical scale in the range [0, 10]. Specifically, we study their individual performance on two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.}
\resizebox{1.0\linewidth}{!}{%
\begin{tabular}{c|cccc|cccc}
\toprule
& \multicolumn{4}{c|}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\
& Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\
\midrule
CLIP-v1$^\diamondsuit$ & $89.7$ & $\bf 96.6$ & $\bf 97.6$ & \cellcolor{skyblue} $94.4$ & $20.8$ & $4.50$ & $16.6$ & \cellcolor{skyblue} $7.90$ \\
BLIP-v2$^\diamondsuit$ & $6.90$ & $0.00$ & $4.80$ & \cellcolor{skyblue} $4.50$ & $58.4$ & $51.1$ & $35.7$ & \cellcolor{skyblue} $49.1$ \\
PickScore-v1$^\diamondsuit$ & $89.7$ & $82.8$ & $88.1$ & \cellcolor{skyblue} $86.5$ & $3.10$ & $48.2$ & $2.10$ & \cellcolor{skyblue} $32.2$ \\
HPS-v2.1$^\diamondsuit$ & $89.7$ & $86.2$ & $85.7$ & \cellcolor{skyblue} $87.6$ & $1.10$ & $30.8$ & $0.60$ & \cellcolor{skyblue} $15.1$ \\
ImageReward$^\diamondsuit$ & $\bf 96.6$ & $\bf 96.6$ & $95.2$ & \cellcolor{skyblue} $\bf 95.5$ & $31.1$ & $10.2$ & $27.4$ & \cellcolor{skyblue} $18.2$ \\
Aesthetics$^\diamondsuit$ & $51.7$ & $58.6$ & $64.3$ & \cellcolor{skyblue} $57.3$ & $14.6$ & $\bf 55.2$ & $14.2$ & \cellcolor{skyblue} $37.5$ \\
\midrule
LLaVA-1.5-7b$^\heartsuit$ & $44.8$ & $41.4$ & $47.6$ & \cellcolor{skyblue} $43.8$ & $35.7$ & $21.2$ & $17.6$ & \cellcolor{skyblue} $26.3$ \\
LLaVA-1.5-13b$^\heartsuit$ & $31.0$ & $31.0$ & $40.5$ & \cellcolor{skyblue} $33.7$ & $40.8$ & $29.9$ & $33.6$ & \cellcolor{skyblue} $34.7$ \\
LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $24.1$ & $19.0$ & \cellcolor{skyblue} $21.3$ & $35.7$ & $14.1$ & $23.3$ & \cellcolor{skyblue} $25.6$ \\
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $44.8$ & $37.9$ & $52.4$ & \cellcolor{skyblue} $43.8$ & $40.9$ & $25.1$ & $27.8$ & \cellcolor{skyblue} $36.5$ \\
InstructBLIP-7b$^\heartsuit$ & $31.0$ & $34.5$ & $40.5$ & \cellcolor{skyblue} $39.3$ & $36.9$ & $24.2$ & $30.6$ & \cellcolor{skyblue} $33.7$ \\
MiniGPT4-v2$^\heartsuit$ & $41.4$ & $62.1$ & $42.9$ & \cellcolor{skyblue} $48.3$ & $39.6$ & $21.4$ & $36.5$ & \cellcolor{skyblue} $32.6$ \\
Prometheus-Vision-7b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $10.3$ & $6.80$ & $4.30$ & \cellcolor{skyblue} $7.10$ \\
Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $6.50$ & $4.10$ & $4.20$ & \cellcolor{skyblue} $5.30$ \\
Qwen-VL-Chat$^\spadesuit$ & $27.6$ & $13.8$ & $31.0$ & \cellcolor{skyblue} $24.7$ & $18.9$ & $7.60$ & $6.30$ & \cellcolor{skyblue} $11.6$ \\
InternVL-Chat-V1-5$^\spadesuit$ & $34.5$ & $10.3$ & $28.6$ & \cellcolor{skyblue} $25.8$ & $23.3$ & $10.6$ & $7.20$ & \cellcolor{skyblue} $16.2$ \\
Idefics2-8b$^\spadesuit$ & $58.6$ & $44.8$ & $57.1$ & \cellcolor{skyblue} $52.8$ & $32.9$ & $13.2$ & $19.5$ & \cellcolor{skyblue} $20.2$ \\
\midrule
GPT-4-vision$^\clubsuit$ & $75.9$ & $69.0$ & $81.0$ & \cellcolor{skyblue} $76.4$ & $69.5$ & $43.2$ & $32.5$ & \cellcolor{skyblue} $44.1$ \\
GPT-4o$^\clubsuit$ & $86.2$ & $\bf 96.6$ & $95.2$ & \cellcolor{skyblue} $92.1$ & $\bf 72.3$ & $51.7$ & $\bf 38.9$ & \cellcolor{skyblue} $\bf 54.3$ \\
Gemini Ultra$^\clubsuit$ & $65.5$ & $41.4$ & $78.6$ & \cellcolor{skyblue} $64.0$ & $31.6$ & $19.1$ & $10.3$ & \cellcolor{skyblue} $22.7$ \\
Claude 3 Opus$^\clubsuit$ & $62.1$ & $37.9$ & $50.0$ & \cellcolor{skyblue} $50.6$ & $10.5$ & $6.20$ & $3.60$ & \cellcolor{skyblue} $8.30$ \\
\bottomrule
\end{tabular}%
}
\label{exp:safety_result_number_10}
\end{table}