% Spaces:
% Running
% Running
\begin{table}[t]
\centering
\caption{Human evaluation results on the images generated by six fine-tuned SD-v1.5 models using the feedback from six multimodal judges, i.e., GPT-4o, GPT-4-vision, Gemini Ultra, Claude 3 Opus, Internvl-chat-v1-5, and HPS-v2.1. Specifically, we consider the following four metrics: ranking over fixed seed (\textbf{FR}), ranking over random seed (\textbf{RR}), average ranking (\textbf{AR}), and average voting (\textbf{AV}). The two best results in each column (including ties) are bolded.}
\setlength{\tabcolsep}{2pt}
\renewcommand{\arraystretch}{0.9}
\resizebox{1.0\linewidth}{!}{%
\begin{tabular}{l|cccc|cccc|cccc}
\toprule
& \multicolumn{4}{c}{\bf Alignment} & \multicolumn{4}{c}{\bf Safety} & \multicolumn{4}{c}{\bf Bias} \\
& FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} \\
\midrule
GPT-4o$^\clubsuit$ & \bf 2.16 & \bf 2.66 & \cellcolor{skyblue}{\bf 2.50} & \cellcolor{skyblue}{\bf 17.21\%} & 1.91 & \bf 1.88 & \cellcolor{skyblue}{\bf 1.89} & \cellcolor{skyblue}{\bf 17.37\%} & \bf 1.72 & \bf 2.48 & \cellcolor{skyblue}{\bf 2.10} & \cellcolor{skyblue}{\bf 21.58\%} \\
GPT-4-vision$^\clubsuit$ & 2.43 & 2.81 & \cellcolor{skyblue}{2.68} & \cellcolor{skyblue}{15.96\%} & \bf 1.84 & 1.98 & \cellcolor{skyblue}{1.94} & \cellcolor{skyblue}{16.81\%} & 1.99 & 3.14 & \cellcolor{skyblue}{2.57} & \cellcolor{skyblue}{16.80\%} \\
Gemini Ultra$^\clubsuit$ & \bf 2.15 & 2.72 & \cellcolor{skyblue}{2.54} & \cellcolor{skyblue}{14.87\%} & \bf 1.55 & \bf 1.69 & \cellcolor{skyblue}{\bf 1.64} & \cellcolor{skyblue}{\bf 18.98\%} & 2.23 & \bf 2.65 & \cellcolor{skyblue}{2.44} & \cellcolor{skyblue}{16.18\%} \\
Claude 3 Opus$^\clubsuit$ & 2.25 & 2.80 & \cellcolor{skyblue}{2.62} & \cellcolor{skyblue}{15.34\%} & 2.07 & 2.12 & \cellcolor{skyblue}{2.10} & \cellcolor{skyblue}{16.15\%} & 2.29 & 3.43 & \cellcolor{skyblue}{2.86} & \cellcolor{skyblue}{11.62\%} \\
Internvl-chat-v1-5$^\spadesuit$ & 3.16 & 2.99 & \cellcolor{skyblue}{3.05} & \cellcolor{skyblue}{16.90\%} & 2.49 & 2.28 & \cellcolor{skyblue}{2.35} & \cellcolor{skyblue}{15.30\%} & 1.97 & 3.43 & \cellcolor{skyblue}{2.70} & \cellcolor{skyblue}{14.52\%} \\
HPS-v2.1$^\diamondsuit$ & 2.21 & \bf 2.42 & \cellcolor{skyblue}{\bf 2.35} & \cellcolor{skyblue}{\bf 19.72\%} & 2.42 & 2.37 & \cellcolor{skyblue}{2.39} & \cellcolor{skyblue}{15.39\%} & \bf 1.78 & \bf 2.65 & \cellcolor{skyblue}{\bf 2.21} & \cellcolor{skyblue}{\bf 19.29\%} \\
\bottomrule
\end{tabular}%
}
\label{exp:human_eval}
\end{table}