ko-bench / ko_bench.csv
davidkim205's picture
add claud-3-5 results
174062d
raw
history blame
7.03 kB
judge_model,turn,model,score,Coding,Extraction,Humanities,Math,Reasoning,Roleplay,STEM,Writing
gpt-4o,1,openai__GPT-4o-2024-05-13,9.4,8.7,9.6,9.6,9.9,9.0,9.2,9.7,9.3
gpt-4o,1,Anthropic__claude-3-5-sonnet-20240620,9.0,6.7,9.5,9.2,9.6,9.3,8.7,9.8,9.0
gpt-4o,1,openai__gpt-4-0125-preview,8.9,7.7,9.8,9.1,9.7,7.8,9.2,8.7,9.4
gpt-4o,1,openai__GPT-4o-mini-2024-07-18,8.8,7.3,9.2,9.4,10.0,6.9,8.7,9.6,9.1
gpt-4o,1,Anthropic__claude-3-opus-20240229,8.6,8.1,9.7,9.3,8.7,5.8,8.2,9.4,9.5
gpt-4o,1,mistralai__Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
gpt-4o,1,Qwen__Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
gpt-4o,1,google__gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
gpt-4o,1,google__gemini-1.5-pro,8.2,5.5,9.7,8.7,7.5,6.5,9.1,9.4,9.2
gpt-4o,1,davidkim205__ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
gpt-4o,1,google__gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
gpt-4o,1,alpindale__WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
gpt-4o,1,openai__gpt-3.5-turbo-0125,6.7,5.2,9.0,7.7,6.4,3.3,7.2,6.5,8.6
gpt-4o,1,meta-llama__Meta-Llama-3.1-70B-Instruct,6.6,6.4,8.7,8.0,4.5,4.0,7.9,7.4,5.9
gpt-4o,1,Qwen__Qwen2-7B-Instruct,6.5,3.9,9.0,8.0,5.6,3.6,7.0,6.6,8.2
gpt-4o,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,4.9,7.4,7.1,7.3,5.1,6.4,4.1,7.6
gpt-4o,1,Qwen__Qwen1.5-32B-Chat,6.1,4.0,8.6,8.5,4.7,2.6,6.3,7.5,6.7
gpt-4o,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.8,3.5,5.0,8.5,5.4,3.2,5.4,7.5,7.6
gpt-4o,1,davidkim205__Ko-Llama-3-8B-Instruct,5.7,4.6,7.0,7.7,2.8,2.5,6.2,6.9,7.6
gpt-4o,1,meta-llama__Meta-Llama-3.1-8B-Instruct,5.4,4.6,7.4,6.3,5.2,3.3,5.2,5.4,6.0
gpt-4o,1,Qwen__Qwen1.5-14B-Chat,5.4,3.3,7.2,6.8,4.2,2.0,5.7,6.7,7.2
gpt-4o,1,WizardLMTeam__WizardLM-13B-V1.2,4.8,3.4,8.2,6.1,2.2,3.4,5.0,4.3,6.1
gpt-4o,1,mistralai__Mistral-7B-Instruct-v0.2,2.6,3.0,3.7,2.0,1.7,1.3,4.5,1.4,3.1
gpt-4o,2,openai__GPT-4o-2024-05-13,8.3,7.9,8.9,9.2,8.1,7.0,8.9,8.7,7.5
gpt-4o,2,openai__gpt-4-0125-preview,8.0,7.2,8.5,8.9,6.8,7.3,8.7,8.1,8.6
gpt-4o,2,Anthropic__claude-3-5-sonnet-20240620,7.9,6.9,9.1,9.0,6.4,6.9,8.1,8.2,8.4
gpt-4o,2,openai__GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
gpt-4o,2,mistralai__Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
gpt-4o,2,google__gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
gpt-4o,2,google__gemini-1.5-pro,7.0,6.3,7.7,8.3,6.1,5.0,8.5,7.8,6.5
gpt-4o,2,Anthropic__claude-3-opus-20240229,6.9,6.0,9.0,7.3,6.2,5.8,7.3,6.5,7.5
gpt-4o,2,Qwen__Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
gpt-4o,2,davidkim205__ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
gpt-4o,2,alpindale__WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
gpt-4o,2,google__gemma-2-9b-it,6.2,4.8,7.6,8.3,4.9,3.9,7.0,7.4,6.1
gpt-4o,2,Qwen__Qwen1.5-32B-Chat,5.8,4.3,8.2,7.6,3.8,3.0,6.8,5.9,6.9
gpt-4o,2,meta-llama__Meta-Llama-3.1-70B-Instruct,5.7,5.5,8.0,7.4,3.6,2.9,6.6,5.7,5.7
gpt-4o,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,5.6,5.8,6.2,5.5,7.0,4.0,5.7,4.3,6.5
gpt-4o,2,openai__gpt-3.5-turbo-0125,5.4,5.8,5.7,7.2,4.4,3.0,6.6,4.4,6.4
gpt-4o,2,Qwen__Qwen2-7B-Instruct,5.3,5.0,7.0,6.6,5.1,2.7,5.6,4.8,5.9
gpt-4o,2,Qwen__Qwen1.5-14B-Chat,4.9,3.5,5.1,7.4,4.1,2.7,5.9,5.0,5.9
gpt-4o,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,4.5,3.3,3.8,7.6,4.9,2.1,5.6,5.7,3.3
gpt-4o,2,mistralai__Mistral-7B-Instruct-v0.2,4.5,3.9,4.4,6.8,2.2,2.4,6.2,5.6,4.6
gpt-4o,2,davidkim205__Ko-Llama-3-8B-Instruct,4.0,3.7,4.3,6.4,2.8,2.3,4.9,4.0,4.1
gpt-4o,2,meta-llama__Meta-Llama-3.1-8B-Instruct,3.9,4.1,5.0,4.8,3.8,2.1,4.0,3.5,3.6
gpt-4o,2,WizardLMTeam__WizardLM-13B-V1.2,3.0,2.6,3.5,3.6,1.8,2.3,3.7,3.3,2.8
keval,1,openai__GPT-4o-2024-05-13,9.1,7.8,9.5,9.6,9.9,8.8,8.7,9.3,9.2
keval,1,Anthropic__claude-3-5-sonnet-20240620,9.0,7.2,9.8,9.2,9.3,9.2,8.9,9.4,9.0
keval,1,openai__gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
keval,1,openai__GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
keval,1,Anthropic__claude-3-opus-20240229,8.4,8.1,9.8,8.7,8.3,5.8,7.9,9.2,9.0
keval,1,mistralai__Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
keval,1,google__gemini-1.5-pro,8.2,5.7,9.8,8.8,7.4,6.2,9.1,9.7,9.0
keval,1,google__gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
keval,1,Qwen__Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
keval,1,davidkim205__ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
keval,1,google__gemma-2-9b-it,7.6,6.7,8.8,8.5,5.2,5.5,9.0,8.6,8.5
keval,1,meta-llama__Meta-Llama-3.1-70B-Instruct,7.3,6.8,9.0,8.3,5.9,5.1,8.4,8.0,7.1
keval,1,Qwen__Qwen1.5-14B-Chat,7.2,4.7,9.7,8.8,4.5,4.8,8.1,8.9,8.4
keval,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,7.2,5.7,8.8,8.1,8.2,6.0,7.7,5.6,7.3
keval,1,alpindale__WizardLM-2-8x22B,7.1,6.1,5.6,7.9,8.8,5.9,6.5,8.7,7.1
keval,1,Qwen__Qwen1.5-32B-Chat,7.0,3.9,9.9,8.9,5.8,3.6,7.1,8.6,7.9
keval,1,openai__gpt-3.5-turbo-0125,6.9,5.6,8.9,7.7,6.4,3.2,7.4,7.5,8.6
keval,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,6.8,3.4,8.6,8.5,5.5,4.1,6.9,8.8,8.4
keval,1,Qwen__Qwen2-7B-Instruct,6.4,3.6,9.0,7.7,5.5,3.5,7.1,6.7,8.4
keval,1,meta-llama__Meta-Llama-3.1-8B-Instruct,6.3,4.3,8.9,7.7,5.3,3.3,7.3,6.0,7.5
keval,1,davidkim205__Ko-Llama-3-8B-Instruct,6.0,5.0,7.4,7.6,2.9,2.9,7.0,8.0,7.6
keval,1,WizardLMTeam__WizardLM-13B-V1.2,6.0,3.7,9.3,7.7,2.4,3.8,7.0,6.6,7.7
keval,1,mistralai__Mistral-7B-Instruct-v0.2,3.0,3.0,6.7,3.0,2.0,2.0,3.3,1.9,2.4
keval,2,openai__GPT-4o-2024-05-13,8.1,7.7,8.9,9.2,7.8,6.9,8.4,8.7,7.4
keval,2,openai__gpt-4-0125-preview,7.7,6.3,8.4,8.8,6.9,6.3,8.6,8.6,8.0
keval,2,openai__GPT-4o-mini-2024-07-18,7.4,6.8,7.6,8.7,7.7,4.3,7.8,8.4,7.8
keval,2,Anthropic__claude-3-5-sonnet-20240620,7.3,6.6,7.6,9.0,6.6,5.7,7.6,8.1,7.1
keval,2,mistralai__Mistral-Large-Instruct-2407,7.0,5.4,7.3,8.5,7.3,5.2,7.9,7.8,6.9
keval,2,Qwen__Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
keval,2,google__gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
keval,2,Anthropic__claude-3-opus-20240229,6.8,6.2,8.4,7.8,5.4,5.1,7.0,7.3,7.5
keval,2,alpindale__WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
keval,2,google__gemini-1.5-pro,6.5,5.2,6.9,8.4,6.0,4.8,8.1,7.3,5.4
keval,2,davidkim205__ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
keval,2,google__gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
keval,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
keval,2,Qwen__Qwen1.5-32B-Chat,6.2,5.2,7.7,8.0,4.1,4.0,7.7,6.7,6.5
keval,2,Qwen__Qwen1.5-14B-Chat,6.0,4.7,6.9,7.9,4.8,3.8,7.2,6.3,6.7
keval,2,meta-llama__Meta-Llama-3.1-70B-Instruct,6.0,6.0,7.3,7.6,5.6,2.9,7.0,6.2,5.6
keval,2,Qwen__Qwen2-7B-Instruct,5.6,4.9,7.0,6.5,5.1,3.1,6.3,5.0,6.5
keval,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.5,4.6,4.9,6.7,5.9,3.2,6.9,6.8,5.2
keval,2,openai__gpt-3.5-turbo-0125,5.3,6.2,5.5,7.0,4.5,3.3,6.2,4.5,5.4
keval,2,meta-llama__Meta-Llama-3.1-8B-Instruct,4.8,5.0,6.0,5.5,4.4,2.6,5.9,5.0,4.4
keval,2,davidkim205__Ko-Llama-3-8B-Instruct,4.2,3.6,4.6,6.3,2.8,2.2,6.1,3.7,4.3
keval,2,WizardLMTeam__WizardLM-13B-V1.2,4.1,3.7,5.4,5.8,2.8,3.0,5.6,3.3,3.4
keval,2,mistralai__Mistral-7B-Instruct-v0.2,4.1,3.5,6.1,6.3,2.6,2.2,3.5,3.2,5.5