% yichao's picture
% update mj-bench
% b650828
% raw
% history blame
% 2.47 kB
% Bias-perspective results for every multimodal judge under three feedback
% scales: numerical [0-5], numerical [0-10], and a five-level Likert scale.
% Each scale reports three columns (ACC, NDS, GES); the column-wise best value
% is set in bold, as stated in the caption.
% Requires: booktabs (\toprule, \midrule, \bottomrule) and graphicx (\resizebox).
% NOTE(review): the \heartsuit / \spadesuit / \clubsuit superscripts on model
% names are not explained in this caption -- confirm they are defined elsewhere.
\begin{table}[t]
\centering
\caption{The detailed evaluation results of all multimodal judges on the \textbf{bias} perspective. The feedback is provided in different scales, including numerical scales ([0--5] and [0--10]) and a Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. We study the average ACC, NDS, and GES scores for each model across all occupations/educations. The best performance across all models is bolded.}
\resizebox{1.0\linewidth}{!}{%
\begin{tabular}{c|ccc|ccc|ccc}
\toprule
% \multicolumn replaces the column spec of the cells it spans, so the trailing
% `|` must be repeated here to keep the group separators continuous.
& \multicolumn{3}{c|}{\textbf{Numerical [0--5]}} & \multicolumn{3}{c|}{\textbf{Numerical [0--10]}} & \multicolumn{3}{c}{\textbf{Likert scale}} \\
& ACC & NDS & GES & ACC & NDS & GES & ACC & NDS & GES \\
\midrule
LLaVA-1.5-7b$^\heartsuit$ & \textbf{80.8} & 64.6 & 87.7 & 47.1 & 77.3 & 90.1 & \textbf{81.5} & 82.4 & 94.2 \\
LLaVA-1.5-13b$^\heartsuit$ & 55.5 & 77.5 & 90.0 & 37.8 & 78.7 & 89.4 & 61.2 & 78.4 & 91.0 \\
LLaVA-NeXT-mistral-7b$^\heartsuit$ & 72.1 & 71.2 & 88.3 & 58.6 & 65.4 & 84.1 & 59.1 & 68.3 & 86.1 \\
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 49.3 & 68.1 & 85.2 & 42.6 & 69.6 & 84.9 & 53.5 & 73.1 & 87.6 \\
% Likert GES: 94.3 is the highest value in this column, so it carries the bold.
% NOTE(review): the bold previously sat on 94.2 (LLaVA-1.5-7b); confirm both
% numbers against the raw results in case one of them is a transcription error.
Instructblip-7b$^\heartsuit$ & 58.7 & \textbf{85.3} & 91.5 & 53.6 & 80.6 & 91.1 & 71.5 & 84.5 & \textbf{94.3} \\
MiniGPT4-v2$^\heartsuit$ & 35.6 & 69.2 & 79.5 & 32.6 & 67.0 & 83.3 & 38.5 & 39.3 & 68.9 \\
Prometheus-Vision-7b$^\heartsuit$ & 49.5 & 43.4 & 74.4 & 52.1 & 37.9 & 73.0 & 47.4 & 25.3 & 64.6 \\
Prometheus-Vision-13b$^\heartsuit$ & 66.3 & 46.3 & 76.8 & \textbf{68.2} & 23.3 & 69.4 & 67.6 & 47.4 & 77.6 \\
Qwen-VL-Chat$^\spadesuit$ & 71.8 & 76.3 & 91.3 & 30.1 & 70.6 & 85.7 & 45.9 & 74.9 & 88.0 \\
Internvl-chat-v1-5$^\spadesuit$ & 41.0 & 74.1 & 87.2 & 25.4 & 69.6 & 84.3 & 59.2 & 83.6 & 92.6 \\
Idefics2-8b$^\spadesuit$ & 41.9 & 68.7 & 84.4 & 42.1 & 66.7 & 83.4 & 61.6 & \textbf{86.5} & 93.9 \\
\midrule
GPT-4-vision$^\clubsuit$ & 79.1 & 80.2 & \textbf{93.2} & 41.5 & \textbf{86.4} & \textbf{93.7} & 58.7 & 69.8 & 87.1 \\
GPT-4o$^\clubsuit$ & 66.6 & 82.7 & 92.9 & 26.2 & 74.2 & 86.5 & 74.3 & 79.2 & 92.2 \\
Gemini Ultra$^\clubsuit$ & 56.9 & 75.8 & 89.0 & 36.2 & 72.4 & 85.6 & 74.5 & 78.4 & 91.6 \\
Claude 3 Opus$^\clubsuit$ & 58.2 & 66.1 & 85.2 & 52.1 & 59.5 & 82.1 & 57.4 & 83.6 & 92.5 \\
\bottomrule
\end{tabular}%
}
% Label kept as-is (no tab: prefix) because references elsewhere may use it.
\label{exp:bias_scale}
\end{table}