,model,score,scenario,source,aggragated_from 0,gpt-4-turbo-2024-04-09,82.6,arena-hard,arena_hard_2404,[] 1,gpt-4-0125-preview,78.0,arena-hard,arena_hard_2404,[] 2,gemini-1.5-pro-api-preview,72.0,arena-hard,arena_hard_2404,[] 3,yi-large,63.7,arena-hard,arena_hard_2404,[] 4,claude-3-opus-20240229,60.4,arena-hard,arena_hard_2404,[] 5,glm-4,55.7,arena-hard,arena_hard_2404,[] 6,gpt-4-0314,50.0,arena-hard,arena_hard_2404,[] 7,gemini-1.5-flash-api-preview,49.6,arena-hard,arena_hard_2404,[] 8,claude-3-sonnet-20240229,46.8,arena-hard,arena_hard_2404,[] 9,claude-3-haiku-20240307,41.5,arena-hard,arena_hard_2404,[] 10,llama-3-70b-chat-hf,41.1,arena-hard,arena_hard_2404,[] 11,gpt-4-0613,37.9,arena-hard,arena_hard_2404,[] 12,mistral-large-2402,37.7,arena-hard,arena_hard_2404,[] 13,mixtral-8x22b-instruct-v0.1,36.4,arena-hard,arena_hard_2404,[] 14,qwen1.5-72b-chat,36.1,arena-hard,arena_hard_2404,[] 15,command-r-plus,33.1,arena-hard,arena_hard_2404,[] 16,mistral-medium,31.9,arena-hard,arena_hard_2404,[] 17,mistral-next,27.4,arena-hard,arena_hard_2404,[] 18,gpt-3.5-turbo-0613,24.8,arena-hard,arena_hard_2404,[] 19,claude-2.0,24.0,arena-hard,arena_hard_2404,[] 20,dbrx-instructruct,23.9,arena-hard,arena_hard_2404,[] 21,mixtral-8x7b-instruct-v0.1,23.4,arena-hard,arena_hard_2404,[] 22,gpt-3.5-turbo-0125,23.3,arena-hard,arena_hard_2404,[] 23,yi-34b-chat,23.1,arena-hard,arena_hard_2404,[] 24,starling-lm-7b-beta,23.0,arena-hard,arena_hard_2404,[] 25,claude-2.1,22.8,arena-hard,arena_hard_2404,[] 26,snorkel-mistral-pairrm-dpo,20.7,arena-hard,arena_hard_2404,[] 27,llama-3-8b-chat-hf,20.6,arena-hard,arena_hard_2404,[] 28,gpt-3.5-turbo-1106,18.9,arena-hard,arena_hard_2404,[] 29,gpt-3.5-turbo-0301,18.1,arena-hard,arena_hard_2404,[] 30,gemini-1.0-pro,17.8,arena-hard,arena_hard_2404,[] 31,snowflake-arctic-instruct,17.6,arena-hard,arena_hard_2404,[] 32,command-r,17.0,arena-hard,arena_hard_2404,[] 33,phi-3-mini-128k-instruct,15.4,arena-hard,arena_hard_2404,[] 34,tulu-2-dpo-70b,15.0,arena-hard,arena_hard_2404,[] 35,starling-lm-7b-alpha,12.8,arena-hard,arena_hard_2404,[] 36,mistral-7b-instruct,12.6,arena-hard,arena_hard_2404,[] 37,gemma-1.1-7b-it,12.1,arena-hard,arena_hard_2404,[] 38,llama-2-70b-chat-hf,11.6,arena-hard,arena_hard_2404,[] 39,vicuna-33b-v1.3,8.6,arena-hard,arena_hard_2404,[] 40,gemma-7b-it,7.5,arena-hard,arena_hard_2404,[] 41,llama-2-7b-chat-hf,4.6,arena-hard,arena_hard_2404,[] 42,gemma-1.1-2b-it,3.4,arena-hard,arena_hard_2404,[] 43,gemma-2b-it,3.0,arena-hard,arena_hard_2404,[] 0,gpt-4o-2024-05-13,64.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 1,claude-3-opus,63.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 2,gpt-4-turbo-2024-04-09,62.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 3,gemini-1.5-pro-api-0409,58.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 4,yi-large-preview,56.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 5,llama-3-70b-instruct,55.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 6,qwen-max-0428,55.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 7,claude-3-sonnet,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 8,reka-core-20240415,52.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 9,mammoth2-8x7b-plus,51.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 10,deepseek-v2,51.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 11,command-r-plus,51.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 12,yi-1.5-34b-chat,51.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 13,mistral-large,50.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 14,qwen1.5-72b-chat,48.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 15,mistral-medium,47.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 16,gemini-1.0-pro,46.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 17,reka-flash-20240226,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 18,mistral-small,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 19,llama-3-8b-instruct,45.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 20,command-r,45.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 21,qwen1.5-32b-chat,43.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 22,gpt-3.5-turbo-0125,43.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 23,claude-3-haiku,42.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 24,yi-34b-chat,42.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 25,mixtral-8x7b-instruct-v0.1,42.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 26,starling-lm-7b-beta,41.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 27,yi-1.5-9b-chat,40.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 28,gemma-1.1-7b-it,39.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 29,vicuna-33b-v1.3,38.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 30,llama-2-70b-chat,38.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 31,map-neo-instruct-v0.1,37.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 32,mistral-7b-instruct-v0.2,36.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 33,qwen1.5-7b-chat,35.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 34,reka-edge-20240208,32.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 35,zephyr-7b-beta,31.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 36,llama-2-7b-chat,30.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 37,yi-6b-chat,30.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 38,qwen1.5-moe-a2.7b-chat,29.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 39,gemma-1.1-2b-it,28.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 40,vicuna-7b-v1.5,27.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 41,olmo-7b-instruct,26.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 42,qwen1.5-4b-chat,24.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 43,jetmoe-8b-chat,24.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 44,mpt-7b-chat,23.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 45,llama-3-70b,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 46,qwen1.5-72b,41.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 47,yi-34b,47.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 48,qwen1.5-32b,41.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 49,mixtral-8x7b,40.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 50,llama-2-70b,41.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 51,qwen1.5-moe-a2.7b,33.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 52,qwen1.5-7b,33.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 53,llama-3-8b,31.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 54,mistral-7b,27.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 55,gemma-7b,32.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 56,yi-6b,30.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 57,qwen1.5-4b,23.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 58,jetmoe-8b,27.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 59,deepseek-7b,21.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 60,phi-2,21.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 61,deepseekmoe-16b,24.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 62,llama-2-7b,22.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 63,gemma-2b,22.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 64,olmo-7b,21.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 65,mpt-7b,17.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 66,gpt-4o-2024-05-13,87.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 67,claude-3-opus,88.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 68,gpt-4-turbo-2024-04-09,88.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 69,gemini-1.5-pro-api-0409,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 70,yi-large-preview,84.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 71,llama-3-70b-instruct,84.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 72,qwen-max-0428,86.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 73,claude-3-sonnet,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 74,reka-core-20240415,83.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 75,mammoth2-8x7b-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 76,deepseek-v2,83.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 77,command-r-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 78,yi-1.5-34b-chat,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 79,mistral-large,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 80,qwen1.5-72b-chat,84.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 81,mistral-medium,81.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 82,gemini-1.0-pro,78.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 83,reka-flash-20240226,79.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 84,mistral-small,81.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 85,llama-3-8b-instruct,75.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 86,command-r,77.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 87,qwen1.5-32b-chat,81.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 88,gpt-3.5-turbo-0125,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 89,claude-3-haiku,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 90,yi-34b-chat,80.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 91,mixtral-8x7b-instruct-v0.1,76.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 92,starling-lm-7b-beta,74.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 93,yi-1.5-9b-chat,74.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 94,gemma-1.1-7b-it,69.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 95,vicuna-33b-v1.3,66.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 96,llama-2-70b-chat,74.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 97,map-neo-instruct-v0.1,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 98,mistral-7b-instruct-v0.2,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 99,qwen1.5-7b-chat,71.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 100,reka-edge-20240208,68.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 101,zephyr-7b-beta,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 102,llama-2-7b-chat,61.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 103,yi-6b-chat,65.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 104,qwen1.5-moe-a2.7b-chat,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 105,gemma-1.1-2b-it,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 106,vicuna-7b-v1.5,60.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 107,olmo-7b-instruct,55.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 108,qwen1.5-4b-chat,57.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 109,jetmoe-8b-chat,51.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 110,mpt-7b-chat,43.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 111,llama-3-70b,82.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 112,qwen1.5-72b,79.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 113,yi-34b,78.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 114,qwen1.5-32b,77.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 115,mixtral-8x7b,74.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 116,llama-2-70b,73.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 117,qwen1.5-moe-a2.7b,70.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 118,qwen1.5-7b,68.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 119,llama-3-8b,65.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 120,mistral-7b,64.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 121,gemma-7b,64.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 122,yi-6b,63.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 123,qwen1.5-4b,58.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 124,jetmoe-8b,57.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 125,deepseek-7b,52.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 126,phi-2,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 127,deepseekmoe-16b,51.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 128,llama-2-7b,43.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 129,gemma-2b,38.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 130,olmo-7b,31.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 131,mpt-7b,30.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" 264,gpt-4o-2024-05-13,85.4,mmlu-mixed,mixeval_240601,[] 265,claude-3-opus,83.2,mmlu-mixed,mixeval_240601,[] 266,gpt-4-turbo-2024-04-09,82.8,mmlu-mixed,mixeval_240601,[] 267,gemini-1.5-pro-api-0409,79.2,mmlu-mixed,mixeval_240601,[] 268,yi-large-preview,80.9,mmlu-mixed,mixeval_240601,[] 269,llama-3-70b-instruct,80.5,mmlu-mixed,mixeval_240601,[] 270,qwen-max-0428,80.6,mmlu-mixed,mixeval_240601,[] 271,claude-3-sonnet,74.7,mmlu-mixed,mixeval_240601,[] 272,reka-core-20240415,79.3,mmlu-mixed,mixeval_240601,[] 273,mammoth2-8x7b-plus,74.5,mmlu-mixed,mixeval_240601,[] 274,deepseek-v2,77.3,mmlu-mixed,mixeval_240601,[] 275,command-r-plus,78.9,mmlu-mixed,mixeval_240601,[] 276,yi-1.5-34b-chat,76.4,mmlu-mixed,mixeval_240601,[] 277,mistral-large,80.2,mmlu-mixed,mixeval_240601,[] 278,qwen1.5-72b-chat,80.1,mmlu-mixed,mixeval_240601,[] 279,mistral-medium,76.3,mmlu-mixed,mixeval_240601,[] 280,gemini-1.0-pro,74.9,mmlu-mixed,mixeval_240601,[] 281,reka-flash-20240226,75.4,mmlu-mixed,mixeval_240601,[] 282,mistral-small,75.2,mmlu-mixed,mixeval_240601,[] 283,llama-3-8b-instruct,71.9,mmlu-mixed,mixeval_240601,[] 284,command-r,75.0,mmlu-mixed,mixeval_240601,[] 285,qwen1.5-32b-chat,78.0,mmlu-mixed,mixeval_240601,[] 286,gpt-3.5-turbo-0125,74.5,mmlu-mixed,mixeval_240601,[] 287,claude-3-haiku,76.1,mmlu-mixed,mixeval_240601,[] 288,yi-34b-chat,73.6,mmlu-mixed,mixeval_240601,[] 289,mixtral-8x7b-instruct-v0.1,72.0,mmlu-mixed,mixeval_240601,[] 290,starling-lm-7b-beta,69.0,mmlu-mixed,mixeval_240601,[] 291,yi-1.5-9b-chat,72.6,mmlu-mixed,mixeval_240601,[] 292,gemma-1.1-7b-it,66.9,mmlu-mixed,mixeval_240601,[] 293,vicuna-33b-v1.3,59.2,mmlu-mixed,mixeval_240601,[] 294,llama-2-70b-chat,69.8,mmlu-mixed,mixeval_240601,[] 295,map-neo-instruct-v0.1,66.7,mmlu-mixed,mixeval_240601,[] 296,mistral-7b-instruct-v0.2,67.3,mmlu-mixed,mixeval_240601,[] 297,qwen1.5-7b-chat,68.7,mmlu-mixed,mixeval_240601,[] 298,reka-edge-20240208,63.6,mmlu-mixed,mixeval_240601,[] 299,zephyr-7b-beta,64.9,mmlu-mixed,mixeval_240601,[] 300,llama-2-7b-chat,59.4,mmlu-mixed,mixeval_240601,[] 301,yi-6b-chat,65.4,mmlu-mixed,mixeval_240601,[] 302,qwen1.5-moe-a2.7b-chat,69.5,mmlu-mixed,mixeval_240601,[] 303,gemma-1.1-2b-it,51.5,mmlu-mixed,mixeval_240601,[] 304,vicuna-7b-v1.5,58.7,mmlu-mixed,mixeval_240601,[] 305,olmo-7b-instruct,57.1,mmlu-mixed,mixeval_240601,[] 306,qwen1.5-4b-chat,61.4,mmlu-mixed,mixeval_240601,[] 307,jetmoe-8b-chat,58.5,mmlu-mixed,mixeval_240601,[] 308,mpt-7b-chat,37.8,mmlu-mixed,mixeval_240601,[] 309,llama-3-70b,79.8,mmlu-mixed,mixeval_240601,[] 310,qwen1.5-72b,78.8,mmlu-mixed,mixeval_240601,[] 311,yi-34b,79.3,mmlu-mixed,mixeval_240601,[] 312,qwen1.5-32b,77.2,mmlu-mixed,mixeval_240601,[] 313,mixtral-8x7b,71.6,mmlu-mixed,mixeval_240601,[] 314,llama-2-70b,70.8,mmlu-mixed,mixeval_240601,[] 315,qwen1.5-moe-a2.7b,69.4,mmlu-mixed,mixeval_240601,[] 316,qwen1.5-7b,67.0,mmlu-mixed,mixeval_240601,[] 317,llama-3-8b,69.5,mmlu-mixed,mixeval_240601,[] 318,mistral-7b,68.5,mmlu-mixed,mixeval_240601,[] 319,gemma-7b,67.4,mmlu-mixed,mixeval_240601,[] 320,yi-6b,71.2,mmlu-mixed,mixeval_240601,[] 321,qwen1.5-4b,59.6,mmlu-mixed,mixeval_240601,[] 322,jetmoe-8b,55.3,mmlu-mixed,mixeval_240601,[] 323,deepseek-7b,53.3,mmlu-mixed,mixeval_240601,[] 324,phi-2,62.5,mmlu-mixed,mixeval_240601,[] 325,deepseekmoe-16b,49.9,mmlu-mixed,mixeval_240601,[] 326,llama-2-7b,40.8,mmlu-mixed,mixeval_240601,[] 327,gemma-2b,37.4,mmlu-mixed,mixeval_240601,[] 328,olmo-7b,29.7,mmlu-mixed,mixeval_240601,[] 329,mpt-7b,30.9,mmlu-mixed,mixeval_240601,[] 594,gpt-4o-2024-05-13,57.1,mmlu-hard-mixed,mixeval_240601,[] 595,claude-3-opus,55.0,mmlu-hard-mixed,mixeval_240601,[] 596,gpt-4-turbo-2024-04-09,45.5,mmlu-hard-mixed,mixeval_240601,[] 597,gemini-1.5-pro-api-0409,44.6,mmlu-hard-mixed,mixeval_240601,[] 598,yi-large-preview,48.5,mmlu-hard-mixed,mixeval_240601,[] 599,llama-3-70b-instruct,46.3,mmlu-hard-mixed,mixeval_240601,[] 600,qwen-max-0428,41.6,mmlu-hard-mixed,mixeval_240601,[] 601,claude-3-sonnet,40.7,mmlu-hard-mixed,mixeval_240601,[] 602,reka-core-20240415,46.3,mmlu-hard-mixed,mixeval_240601,[] 603,mammoth2-8x7b-plus,41.1,mmlu-hard-mixed,mixeval_240601,[] 604,deepseek-v2,42.0,mmlu-hard-mixed,mixeval_240601,[] 605,command-r-plus,42.0,mmlu-hard-mixed,mixeval_240601,[] 606,yi-1.5-34b-chat,38.1,mmlu-hard-mixed,mixeval_240601,[] 607,mistral-large,42.4,mmlu-hard-mixed,mixeval_240601,[] 608,qwen1.5-72b-chat,37.7,mmlu-hard-mixed,mixeval_240601,[] 609,mistral-medium,38.5,mmlu-hard-mixed,mixeval_240601,[] 610,gemini-1.0-pro,35.5,mmlu-hard-mixed,mixeval_240601,[] 611,reka-flash-20240226,34.6,mmlu-hard-mixed,mixeval_240601,[] 612,mistral-small,33.8,mmlu-hard-mixed,mixeval_240601,[] 613,llama-3-8b-instruct,40.7,mmlu-hard-mixed,mixeval_240601,[] 614,command-r,39.0,mmlu-hard-mixed,mixeval_240601,[] 615,qwen1.5-32b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[] 616,gpt-3.5-turbo-0125,35.1,mmlu-hard-mixed,mixeval_240601,[] 617,claude-3-haiku,30.7,mmlu-hard-mixed,mixeval_240601,[] 618,yi-34b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[] 619,mixtral-8x7b-instruct-v0.1,37.2,mmlu-hard-mixed,mixeval_240601,[] 620,starling-lm-7b-beta,34.2,mmlu-hard-mixed,mixeval_240601,[] 621,yi-1.5-9b-chat,36.8,mmlu-hard-mixed,mixeval_240601,[] 622,gemma-1.1-7b-it,39.0,mmlu-hard-mixed,mixeval_240601,[] 623,vicuna-33b-v1.3,39.4,mmlu-hard-mixed,mixeval_240601,[] 624,llama-2-70b-chat,27.7,mmlu-hard-mixed,mixeval_240601,[] 625,map-neo-instruct-v0.1,32.5,mmlu-hard-mixed,mixeval_240601,[] 626,mistral-7b-instruct-v0.2,29.4,mmlu-hard-mixed,mixeval_240601,[] 627,qwen1.5-7b-chat,29.0,mmlu-hard-mixed,mixeval_240601,[] 628,reka-edge-20240208,26.4,mmlu-hard-mixed,mixeval_240601,[] 629,zephyr-7b-beta,24.2,mmlu-hard-mixed,mixeval_240601,[] 630,llama-2-7b-chat,30.3,mmlu-hard-mixed,mixeval_240601,[] 631,yi-6b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[] 632,qwen1.5-moe-a2.7b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[] 633,gemma-1.1-2b-it,30.3,mmlu-hard-mixed,mixeval_240601,[] 634,vicuna-7b-v1.5,23.4,mmlu-hard-mixed,mixeval_240601,[] 635,olmo-7b-instruct,27.3,mmlu-hard-mixed,mixeval_240601,[] 636,qwen1.5-4b-chat,17.3,mmlu-hard-mixed,mixeval_240601,[] 637,jetmoe-8b-chat,25.5,mmlu-hard-mixed,mixeval_240601,[] 638,mpt-7b-chat,24.7,mmlu-hard-mixed,mixeval_240601,[] 639,llama-3-70b,39.8,mmlu-hard-mixed,mixeval_240601,[] 640,qwen1.5-72b,42.4,mmlu-hard-mixed,mixeval_240601,[] 641,yi-34b,42.4,mmlu-hard-mixed,mixeval_240601,[] 642,qwen1.5-32b,37.2,mmlu-hard-mixed,mixeval_240601,[] 643,mixtral-8x7b,34.6,mmlu-hard-mixed,mixeval_240601,[] 644,llama-2-70b,29.0,mmlu-hard-mixed,mixeval_240601,[] 645,qwen1.5-moe-a2.7b,30.7,mmlu-hard-mixed,mixeval_240601,[] 646,qwen1.5-7b,28.6,mmlu-hard-mixed,mixeval_240601,[] 647,llama-3-8b,38.5,mmlu-hard-mixed,mixeval_240601,[] 648,mistral-7b,27.7,mmlu-hard-mixed,mixeval_240601,[] 649,gemma-7b,28.1,mmlu-hard-mixed,mixeval_240601,[] 650,yi-6b,37.2,mmlu-hard-mixed,mixeval_240601,[] 651,qwen1.5-4b,22.9,mmlu-hard-mixed,mixeval_240601,[] 652,jetmoe-8b,27.3,mmlu-hard-mixed,mixeval_240601,[] 653,deepseek-7b,26.4,mmlu-hard-mixed,mixeval_240601,[] 654,phi-2,29.0,mmlu-hard-mixed,mixeval_240601,[] 655,deepseekmoe-16b,30.7,mmlu-hard-mixed,mixeval_240601,[] 656,llama-2-7b,24.7,mmlu-hard-mixed,mixeval_240601,[] 657,gemma-2b,27.3,mmlu-hard-mixed,mixeval_240601,[] 658,olmo-7b,25.1,mmlu-hard-mixed,mixeval_240601,[] 659,mpt-7b,24.2,mmlu-hard-mixed,mixeval_240601,[] 593,gpt-4-0314,0.57,agieval,BLZ_240312,[] 594,gpt-4-0613,0.57,agieval,BLZ_240312,[] 596,claude-1,0.49700000000000005,agieval,BLZ_240312,[] 601,mixtral-8x7b-instruct-v0.1,0.45299999999999996,agieval,BLZ_240312,[] 602,yi-34b-chat,0.508,agieval,BLZ_240312,[] 605,gpt-3.5-turbo-0314,0.43200000000000005,agieval,BLZ_240312,[] 608,vicuna-33b,0.373,agieval,BLZ_240312,[] 609,starling-lm-7b-alpha,0.401,agieval,BLZ_240312,[] 611,llama-2-70b-chat,0.45,agieval,BLZ_240312,[] 613,openhermes-2.5-mistral-7b,0.43,agieval,BLZ_240312,[] 614,openchat-3.5,0.42700000000000005,agieval,BLZ_240312,[] 617,solar-10.7b-instruct-v1.0,0.47600000000000003,agieval,BLZ_240312,[] 618,dolphin-2.2.1-mistral-7b,0.392,agieval,BLZ_240312,[] 620,zephyr-7b-beta,0.406,agieval,BLZ_240312,[] 623,llama-2-13b-chat,0.336,agieval,BLZ_240312,[] 624,vicuna-13b,0.368,agieval,BLZ_240312,[] 626,zephyr-7b-alpha,0.38,agieval,BLZ_240312,[] 627,qwen-14b-chat,0.396,agieval,BLZ_240312,[] 630,llama-2-7b-chat,0.29600000000000004,agieval,BLZ_240312,[] 632,mistral-7b-instruct-v0.1,0.335,agieval,BLZ_240312,[] 634,vicuna-7b,0.314,agieval,BLZ_240312,[] 636,chatglm3-6b,0.414,agieval,BLZ_240312,[] 643,chatglm-6b,0.325,agieval,BLZ_240312,[] 647,llama-13b,0.205,agieval,BLZ_240312,[] 886,gpt-4-1106-preview,0.977,alpacav1,BLZ_240312,[] 888,gpt-4-0314,0.9528,alpacav1,BLZ_240312,[] 889,gpt-4-0613,0.9528,alpacav1,BLZ_240312,[] 890,mistral-medium,0.9682999999999999,alpacav1,BLZ_240312,[] 891,claude-1,0.8839,alpacav1,BLZ_240312,[] 892,claude-2.0,0.9136,alpacav1,BLZ_240312,[] 893,gemini-pro-dev-api,0.7966,alpacav1,BLZ_240312,[] 894,claude-2.1,0.8708,alpacav1,BLZ_240312,[] 895,gpt-3.5-turbo-0613,0.8937,alpacav1,BLZ_240312,[] 896,mixtral-8x7b-instruct-v0.1,0.9478,alpacav1,BLZ_240312,[] 897,yi-34b-chat,0.9408,alpacav1,BLZ_240312,[] 898,gemini-pro,0.7966,alpacav1,BLZ_240312,[] 900,gpt-3.5-turbo-0314,0.8937,alpacav1,BLZ_240312,[] 902,tulu-2-dpo-70b,0.9503,alpacav1,BLZ_240312,[] 903,vicuna-33b,0.8898999999999999,alpacav1,BLZ_240312,[] 904,starling-lm-7b-alpha,0.9198999999999999,alpacav1,BLZ_240312,[] 906,llama-2-70b-chat,0.9266,alpacav1,BLZ_240312,[] 909,openchat-3.5,0.8851,alpacav1,BLZ_240312,[] 911,gpt-3.5-turbo-1106,0.8626,alpacav1,BLZ_240312,[] 914,wizardlm-13b-v1.2,0.8917,alpacav1,BLZ_240312,[] 915,zephyr-7b-beta,0.9059999999999999,alpacav1,BLZ_240312,[] 918,llama-2-13b-chat,0.8109000000000001,alpacav1,BLZ_240312,[] 921,zephyr-7b-alpha,0.8576,alpacav1,BLZ_240312,[] 924,guanaco-33b,0.6596,alpacav1,BLZ_240312,[] 925,llama-2-7b-chat,0.7137,alpacav1,BLZ_240312,[] 934,chatglm2-6b,0.47130000000000005,alpacav1,BLZ_240312,[] 937,openassistant-pythia-12b,0.2596,alpacav1,BLZ_240312,[] 827,gpt-4-1106-preview,0.5,alpacav2,BLZ_240312,[] 829,gpt-4-0314,0.221,alpacav2,BLZ_240312,[] 830,gpt-4-0613,0.158,alpacav2,BLZ_240312,[] 831,mistral-medium,0.21899999999999997,alpacav2,BLZ_240312,[] 832,claude-1,0.17,alpacav2,BLZ_240312,[] 833,claude-2.0,0.172,alpacav2,BLZ_240312,[] 834,gemini-pro-dev-api,0.16899999999999998,alpacav2,BLZ_240312,[] 835,claude-2.1,0.157,alpacav2,BLZ_240312,[] 836,gpt-3.5-turbo-0613,0.141,alpacav2,BLZ_240312,[] 837,mixtral-8x7b-instruct-v0.1,0.183,alpacav2,BLZ_240312,[] 838,yi-34b-chat,0.297,alpacav2,BLZ_240312,[] 839,gemini-pro,0.16899999999999998,alpacav2,BLZ_240312,[] 840,claude-instant-1,0.161,alpacav2,BLZ_240312,[] 841,gpt-3.5-turbo-0314,0.096,alpacav2,BLZ_240312,[] 842,wizardlm-70b-v1.0,0.14400000000000002,alpacav2,BLZ_240312,[] 843,tulu-2-dpo-70b,0.16,alpacav2,BLZ_240312,[] 844,vicuna-33b,0.127,alpacav2,BLZ_240312,[] 845,starling-lm-7b-alpha,0.142,alpacav2,BLZ_240312,[] 846,deepseek-llm-67b-chat,0.121,alpacav2,BLZ_240312,[] 847,llama-2-70b-chat,0.139,alpacav2,BLZ_240312,[] 849,openhermes-2.5-mistral-7b,0.10300000000000001,alpacav2,BLZ_240312,[] 852,gpt-3.5-turbo-1106,0.092,alpacav2,BLZ_240312,[] 854,dolphin-2.2.1-mistral-7b,0.09,alpacav2,BLZ_240312,[] 855,wizardlm-13b-v1.2,0.12,alpacav2,BLZ_240312,[] 856,zephyr-7b-beta,0.11,alpacav2,BLZ_240312,[] 859,llama-2-13b-chat,0.077,alpacav2,BLZ_240312,[] 860,vicuna-13b,0.067,alpacav2,BLZ_240312,[] 862,zephyr-7b-alpha,0.084,alpacav2,BLZ_240312,[] 863,qwen-14b-chat,0.075,alpacav2,BLZ_240312,[] 865,guanaco-33b,0.05,alpacav2,BLZ_240312,[] 866,llama-2-7b-chat,0.0496,alpacav2,BLZ_240312,[] 870,vicuna-7b,0.048,alpacav2,BLZ_240312,[] 875,chatglm2-6b,0.027999999999999997,alpacav2,BLZ_240312,[] 878,openassistant-pythia-12b,0.018000000000000002,alpacav2,BLZ_240312,[] 1299,gpt-4-1106-preview,0.32799999999999996,alpacaeval2-lc,BLZ_240312,[] 1301,gpt-4-0314,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[] 1302,gpt-4-0613,0.18600000000000003,alpacaeval2-lc,BLZ_240312,[] 1303,mistral-medium,0.196,alpacaeval2-lc,BLZ_240312,[] 1304,claude-1,0.21100000000000002,alpacaeval2-lc,BLZ_240312,[] 1305,claude-2.0,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[] 1306,gemini-pro-dev-api,0.172,alpacaeval2-lc,BLZ_240312,[] 1307,claude-2.1,0.193,alpacaeval2-lc,BLZ_240312,[] 1308,gpt-3.5-turbo-0613,0.14300000000000002,alpacaeval2-lc,BLZ_240312,[] 1309,mixtral-8x7b-instruct-v0.1,0.168,alpacaeval2-lc,BLZ_240312,[] 1310,yi-34b-chat,0.188,alpacaeval2-lc,BLZ_240312,[] 1312,claude-instant-1,0.195,alpacaeval2-lc,BLZ_240312,[] 1313,gpt-3.5-turbo-0314,0.156,alpacaeval2-lc,BLZ_240312,[] 1314,wizardlm-70b-v1.0,0.125,alpacaeval2-lc,BLZ_240312,[] 1315,tulu-2-dpo-70b,0.151,alpacaeval2-lc,BLZ_240312,[] 1316,vicuna-33b,0.115,alpacaeval2-lc,BLZ_240312,[] 1317,starling-lm-7b-alpha,0.10099999999999999,alpacaeval2-lc,BLZ_240312,[] 1318,deepseek-llm-67b-chat,0.141,alpacaeval2-lc,BLZ_240312,[] 1319,llama-2-70b-chat,0.10400000000000001,alpacaeval2-lc,BLZ_240312,[] 1321,openhermes-2.5-mistral-7b,0.126,alpacaeval2-lc,BLZ_240312,[] 1324,gpt-3.5-turbo-1106,0.155,alpacaeval2-lc,BLZ_240312,[] 1326,dolphin-2.2.1-mistral-7b,0.10800000000000001,alpacaeval2-lc,BLZ_240312,[] 1327,wizardlm-13b-v1.2,0.099,alpacaeval2-lc,BLZ_240312,[] 1328,zephyr-7b-beta,0.102,alpacaeval2-lc,BLZ_240312,[] 1331,llama-2-13b-chat,0.068,alpacaeval2-lc,BLZ_240312,[] 1332,vicuna-13b,0.085,alpacaeval2-lc,BLZ_240312,[] 1334,zephyr-7b-alpha,0.086,alpacaeval2-lc,BLZ_240312,[] 1335,qwen-14b-chat,0.1,alpacaeval2-lc,BLZ_240312,[] 1338,llama-2-7b-chat,0.045,alpacaeval2-lc,BLZ_240312,[] 1342,vicuna-7b,0.06,alpacaeval2-lc,BLZ_240312,[] 0,gpt-4-0125-preview,1.0,arena-elo,BLZ_240312,[] 1,gpt-4-1106-preview,0.9992019154030327,arena-elo,BLZ_240312,[] 2,bard-gemini-pro,0.9768555466879489,arena-elo,BLZ_240312,[] 3,gpt-4-0314,0.9497206703910615,arena-elo,BLZ_240312,[] 4,gpt-4-0613,0.9273743016759777,arena-elo,BLZ_240312,[] 5,mistral-medium,0.9177972865123704,arena-elo,BLZ_240312,[] 6,claude-1,0.9169992019154031,arena-elo,BLZ_240312,[] 7,claude-2.0,0.9034317637669593,arena-elo,BLZ_240312,[] 8,gemini-pro-dev-api,0.8938547486033519,arena-elo,BLZ_240312,[] 9,claude-2.1,0.8930566640063847,arena-elo,BLZ_240312,[] 10,gpt-3.5-turbo-0613,0.8922585794094174,arena-elo,BLZ_240312,[] 11,mixtral-8x7b-instruct-v0.1,0.8922585794094174,arena-elo,BLZ_240312,[] 12,yi-34b-chat,0.8898643256185156,arena-elo,BLZ_240312,[] 13,gemini-pro,0.8890662410215483,arena-elo,BLZ_240312,[] 14,claude-instant-1,0.8850758180367119,arena-elo,BLZ_240312,[] 15,gpt-3.5-turbo-0314,0.8818834796488427,arena-elo,BLZ_240312,[] 16,wizardlm-70b-v1.0,0.8818834796488427,arena-elo,BLZ_240312,[] 17,tulu-2-dpo-70b,0.8810853950518756,arena-elo,BLZ_240312,[] 18,vicuna-33b,0.8723064644852354,arena-elo,BLZ_240312,[] 19,starling-lm-7b-alpha,0.8699122106943336,arena-elo,BLZ_240312,[] 20,deepseek-llm-67b-chat,0.8635275339185954,arena-elo,BLZ_240312,[] 21,llama-2-70b-chat,0.8635275339185954,arena-elo,BLZ_240312,[] 22,nv-llama2-70b-steerlm-chat,0.8603351955307262,arena-elo,BLZ_240312,[] 23,openhermes-2.5-mistral-7b,0.8603351955307262,arena-elo,BLZ_240312,[] 24,openchat-3.5,0.8587390263367917,arena-elo,BLZ_240312,[] 25,pplx-70b-online,0.8587390263367917,arena-elo,BLZ_240312,[] 26,gpt-3.5-turbo-1106,0.8547486033519553,arena-elo,BLZ_240312,[] 27,solar-10.7b-instruct-v1.0,0.8499600957701516,arena-elo,BLZ_240312,[] 28,dolphin-2.2.1-mistral-7b,0.8499600957701516,arena-elo,BLZ_240312,[] 29,wizardlm-13b-v1.2,0.8443735035913806,arena-elo,BLZ_240312,[] 30,zephyr-7b-beta,0.8387869114126097,arena-elo,BLZ_240312,[] 31,mpt-30b-chat,0.8332003192338387,arena-elo,BLZ_240312,[] 32,codellama-34b-instruct,0.8324022346368715,arena-elo,BLZ_240312,[] 33,llama-2-13b-chat,0.8316041500399042,arena-elo,BLZ_240312,[] 34,vicuna-13b,0.8300079808459697,arena-elo,BLZ_240312,[] 35,pplx-7b-online,0.8284118116520351,arena-elo,BLZ_240312,[] 36,zephyr-7b-alpha,0.8276137270550679,arena-elo,BLZ_240312,[] 37,qwen-14b-chat,0.825219473264166,arena-elo,BLZ_240312,[] 38,falcon-180b-chat,0.8236233040702314,arena-elo,BLZ_240312,[] 39,guanaco-33b,0.8236233040702314,arena-elo,BLZ_240312,[] 40,llama-2-7b-chat,0.8172386272944933,arena-elo,BLZ_240312,[] 41,stripedhyena-nous-7b,0.8140462889066241,arena-elo,BLZ_240312,[] 42,mistral-7b-instruct-v0.1,0.8028731045490822,arena-elo,BLZ_240312,[] 43,palm-chat-bison-001,0.8028731045490822,arena-elo,BLZ_240312,[] 44,vicuna-7b,0.8020750199521149,arena-elo,BLZ_240312,[] 45,koala-13b,0.770949720670391,arena-elo,BLZ_240312,[] 46,chatglm3-6b,0.7661612130885874,arena-elo,BLZ_240312,[] 47,gpt4all-13b-snoozy,0.74780526735834,arena-elo,BLZ_240312,[] 48,mpt-7b-chat,0.7430167597765364,arena-elo,BLZ_240312,[] 49,chatglm2-6b,0.7422186751795691,arena-elo,BLZ_240312,[] 50,rwkv-4-raven-14b,0.7382282521947326,arena-elo,BLZ_240312,[] 51,alpaca-13b,0.7214684756584198,arena-elo,BLZ_240312,[] 52,openassistant-pythia-12b,0.7158818834796489,arena-elo,BLZ_240312,[] 53,chatglm-6b,0.704708699122107,arena-elo,BLZ_240312,[] 54,fastchat-t5-3b,0.6975259377494014,arena-elo,BLZ_240312,[] 55,stablelm-tuned-alpha-7b,0.6743814844373504,arena-elo,BLZ_240312,[] 56,dolly-v2-12b,0.6568236233040702,arena-elo,BLZ_240312,[] 57,llama-13b,0.6384676775738228,arena-elo,BLZ_240312,[] 542,mixtral-8x7b-instruct-v0.1,0.7641,gpt4all,BLZ_240312,[] 543,yi-34b-chat,0.7212999999999999,gpt4all,BLZ_240312,[] 550,starling-lm-7b-alpha,0.7272,gpt4all,BLZ_240312,[] 554,openhermes-2.5-mistral-7b,0.7312000000000001,gpt4all,BLZ_240312,[] 555,openchat-3.5,0.7292000000000001,gpt4all,BLZ_240312,[] 558,solar-10.7b-instruct-v1.0,0.7511,gpt4all,BLZ_240312,[] 559,dolphin-2.2.1-mistral-7b,0.7223999999999999,gpt4all,BLZ_240312,[] 561,zephyr-7b-beta,0.7182999999999999,gpt4all,BLZ_240312,[] 565,vicuna-13b,0.631,gpt4all,BLZ_240312,[] 567,zephyr-7b-alpha,0.7223999999999999,gpt4all,BLZ_240312,[] 573,mistral-7b-instruct-v0.1,0.6795,gpt4all,BLZ_240312,[] 575,vicuna-7b,0.61,gpt4all,BLZ_240312,[] 576,koala-13b,0.62,gpt4all,BLZ_240312,[] 578,gpt4all-13b-snoozy,0.653,gpt4all,BLZ_240312,[] 579,mpt-7b-chat,0.648,gpt4all,BLZ_240312,[] 583,openassistant-pythia-12b,0.61,gpt4all,BLZ_240312,[] 585,fastchat-t5-3b,0.537,gpt4all,BLZ_240312,[] 586,stablelm-tuned-alpha-7b,0.513,gpt4all,BLZ_240312,[] 588,llama-13b,0.63,gpt4all,BLZ_240312,[] 129,mixtral-8x7b-instruct-v0.1,0.7262000000000001,hugging-6,BLZ_240312,[] 130,yi-34b-chat,0.6531999999999999,hugging-6,BLZ_240312,[] 134,wizardlm-70b-v1.0,0.6125,hugging-6,BLZ_240312,[] 135,tulu-2-dpo-70b,0.7376999999999999,hugging-6,BLZ_240312,[] 136,vicuna-33b,0.585,hugging-6,BLZ_240312,[] 137,starling-lm-7b-alpha,0.6713,hugging-6,BLZ_240312,[] 139,llama-2-70b-chat,0.624,hugging-6,BLZ_240312,[] 141,openhermes-2.5-mistral-7b,0.6152000000000001,hugging-6,BLZ_240312,[] 142,openchat-3.5,0.6124,hugging-6,BLZ_240312,[] 145,solar-10.7b-instruct-v1.0,0.742,hugging-6,BLZ_240312,[] 146,dolphin-2.2.1-mistral-7b,0.6493000000000001,hugging-6,BLZ_240312,[] 147,wizardlm-13b-v1.2,0.5476,hugging-6,BLZ_240312,[] 148,zephyr-7b-beta,0.6195,hugging-6,BLZ_240312,[] 149,mpt-30b-chat,0.5538000000000001,hugging-6,BLZ_240312,[] 150,codellama-34b-instruct,0.5729,hugging-6,BLZ_240312,[] 151,llama-2-13b-chat,0.5490999999999999,hugging-6,BLZ_240312,[] 152,vicuna-13b,0.5539999999999999,hugging-6,BLZ_240312,[] 154,zephyr-7b-alpha,0.595,hugging-6,BLZ_240312,[] 156,falcon-180b-chat,0.6785,hugging-6,BLZ_240312,[] 158,llama-2-7b-chat,0.5074000000000001,hugging-6,BLZ_240312,[] 160,mistral-7b-instruct-v0.1,0.5496,hugging-6,BLZ_240312,[] 162,vicuna-7b,0.521,hugging-6,BLZ_240312,[] 176,yi-34bx2-moe-60b,0.7672,hugging-6,BLZ_240312,[] 947,gpt-4-0314,0.93,llmonitor,BLZ_240312,[] 948,gpt-4-0613,0.89,llmonitor,BLZ_240312,[] 950,claude-1,0.66,llmonitor,BLZ_240312,[] 951,claude-2.0,0.68,llmonitor,BLZ_240312,[] 954,gpt-3.5-turbo-0613,0.81,llmonitor,BLZ_240312,[] 958,claude-instant-1,0.6,llmonitor,BLZ_240312,[] 959,gpt-3.5-turbo-0314,0.79,llmonitor,BLZ_240312,[] 965,llama-2-70b-chat,0.6,llmonitor,BLZ_240312,[] 975,mpt-30b-chat,0.4,llmonitor,BLZ_240312,[] 976,codellama-34b-instruct,0.34,llmonitor,BLZ_240312,[] 977,llama-2-13b-chat,0.5,llmonitor,BLZ_240312,[] 978,vicuna-13b,0.5,llmonitor,BLZ_240312,[] 982,falcon-180b-chat,0.67,llmonitor,BLZ_240312,[] 983,guanaco-33b,0.43,llmonitor,BLZ_240312,[] 984,llama-2-7b-chat,0.5,llmonitor,BLZ_240312,[] 986,mistral-7b-instruct-v0.1,0.57,llmonitor,BLZ_240312,[] 987,palm-chat-bison-001,0.57,llmonitor,BLZ_240312,[] 988,vicuna-7b,0.41,llmonitor,BLZ_240312,[] 989,koala-13b,0.31,llmonitor,BLZ_240312,[] 992,mpt-7b-chat,0.43,llmonitor,BLZ_240312,[] 1000,dolly-v2-12b,0.23,llmonitor,BLZ_240312,[] 59,gpt-4-0125-preview,0.0929,mt-bench,BLZ_240312,[] 60,gpt-4-1106-preview,0.0932,mt-bench,BLZ_240312,[] 62,gpt-4-0314,0.08960000000000001,mt-bench,BLZ_240312,[] 63,gpt-4-0613,0.09179999999999999,mt-bench,BLZ_240312,[] 64,mistral-medium,0.0861,mt-bench,BLZ_240312,[] 65,claude-1,0.079,mt-bench,BLZ_240312,[] 66,claude-2.0,0.0806,mt-bench,BLZ_240312,[] 67,gemini-pro-dev-api,0.08039999999999999,mt-bench,BLZ_240312,[] 68,claude-2.1,0.0818,mt-bench,BLZ_240312,[] 69,gpt-3.5-turbo-0613,0.0839,mt-bench,BLZ_240312,[] 70,mixtral-8x7b-instruct-v0.1,0.083,mt-bench,BLZ_240312,[] 71,yi-34b-chat,0.07769999999999999,mt-bench,BLZ_240312,[] 72,gemini-pro,0.08039999999999999,mt-bench,BLZ_240312,[] 73,claude-instant-1,0.0785,mt-bench,BLZ_240312,[] 74,gpt-3.5-turbo-0314,0.0794,mt-bench,BLZ_240312,[] 75,wizardlm-70b-v1.0,0.0771,mt-bench,BLZ_240312,[] 76,tulu-2-dpo-70b,0.0789,mt-bench,BLZ_240312,[] 77,vicuna-33b,0.0712,mt-bench,BLZ_240312,[] 78,starling-lm-7b-alpha,0.0809,mt-bench,BLZ_240312,[] 79,deepseek-llm-67b-chat,0.08529999999999999,mt-bench,BLZ_240312,[] 80,llama-2-70b-chat,0.06860000000000001,mt-bench,BLZ_240312,[] 81,nv-llama2-70b-steerlm-chat,0.0754,mt-bench,BLZ_240312,[] 82,openhermes-2.5-mistral-7b,0.07690000000000001,mt-bench,BLZ_240312,[] 83,openchat-3.5,0.0781,mt-bench,BLZ_240312,[] 84,pplx-70b-online,0.0588,mt-bench,BLZ_240312,[] 85,gpt-3.5-turbo-1106,0.0832,mt-bench,BLZ_240312,[] 86,solar-10.7b-instruct-v1.0,0.0758,mt-bench,BLZ_240312,[] 88,wizardlm-13b-v1.2,0.07200000000000001,mt-bench,BLZ_240312,[] 89,zephyr-7b-beta,0.07339999999999999,mt-bench,BLZ_240312,[] 90,mpt-30b-chat,0.0639,mt-bench,BLZ_240312,[] 92,llama-2-13b-chat,0.0665,mt-bench,BLZ_240312,[] 93,vicuna-13b,0.06570000000000001,mt-bench,BLZ_240312,[] 95,zephyr-7b-alpha,0.0688,mt-bench,BLZ_240312,[] 96,qwen-14b-chat,0.0696,mt-bench,BLZ_240312,[] 98,guanaco-33b,0.0653,mt-bench,BLZ_240312,[] 99,llama-2-7b-chat,0.06269999999999999,mt-bench,BLZ_240312,[] 101,mistral-7b-instruct-v0.1,0.0684,mt-bench,BLZ_240312,[] 102,palm-chat-bison-001,0.064,mt-bench,BLZ_240312,[] 103,vicuna-7b,0.0617,mt-bench,BLZ_240312,[] 104,koala-13b,0.0535,mt-bench,BLZ_240312,[] 106,gpt4all-13b-snoozy,0.0541,mt-bench,BLZ_240312,[] 107,mpt-7b-chat,0.0542,mt-bench,BLZ_240312,[] 108,chatglm2-6b,0.0496,mt-bench,BLZ_240312,[] 109,rwkv-4-raven-14b,0.0398,mt-bench,BLZ_240312,[] 110,alpaca-13b,0.0453,mt-bench,BLZ_240312,[] 111,openassistant-pythia-12b,0.0432,mt-bench,BLZ_240312,[] 112,chatglm-6b,0.045,mt-bench,BLZ_240312,[] 113,fastchat-t5-3b,0.0304,mt-bench,BLZ_240312,[] 114,stablelm-tuned-alpha-7b,0.0275,mt-bench,BLZ_240312,[] 115,dolly-v2-12b,0.032799999999999996,mt-bench,BLZ_240312,[] 116,llama-13b,0.026099999999999998,mt-bench,BLZ_240312,[] 0,gpt-4-0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 1,llama-3-70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 2,mixtral-8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 3,palmyra-x-v3-72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 4,gpt-4-turbo-1106-preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 5,palm-2-unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 6,claude-3-opus-20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 7,qwen1.5-72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 8,palmyra-x-v2-33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 9,yi-34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 10,qwen1.5-32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 11,claude-v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 12,mixtral-8x7b-32k-seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 13,palm-2-bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 14,claude-2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 15,deepseek-llm-67b-chat,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 16,llama-2-70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 17,claude-2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 18,gpt-3.5-text-davinci-003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 19,qwen1.5-14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 20,claude-instant-1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 21,llama-3-8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 22,gpt-3.5-turbo-0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 23,gemma-7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 24,claude-3-sonnet-20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 25,gpt-3.5-text-davinci-002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 26,llama-65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 27,mistral-large-2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 28,cohere-command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 29,dbrx-instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 30,mistral-v0.1-7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 31,mistral-small-2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 32,mistral-medium-2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 33,qwen1.5-7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 34,claude-3-haiku-20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 35,yi-6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 36,llama-2-13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 37,jurassic-2-jumbo-178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 38,falcon-40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 39,phi-2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 40,jurassic-2-grande-17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 41,llama-2-7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 42,luminous-supreme-70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 43,cohere-command-light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 44,luminous-extended-30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 45,falcon-7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 46,olmo-7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 47,luminous-base-13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" 0,llama-2-70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 1,llama-65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 2,text-davinci-002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 3,mistral-v0.1-7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 4,cohere-command-beta-52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 5,text-davinci-003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 6,jurassic-2-jumbo-178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 7,llama-2-13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 8,tnlg-v2-530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 9,gpt-3.5-turbo-0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 10,llama-30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 11,anthropic-lm-v4-s3-52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 12,gpt-3.5-turbo-0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 13,jurassic-2-grande-17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 14,palmyra-x-43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 15,falcon-40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 16,falcon-instruct-40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 17,mpt-instruct-30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 18,mpt-30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 19,j1-grande-v2-beta-17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 20,vicuna-v1.3-13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 21,cohere-command-beta-6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 22,cohere-xlarge-v20221108-52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 23,luminous-supreme-70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 24,vicuna-v1.3-7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 25,opt-175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 26,llama-2-7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 27,llama-13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 28,instructpalmyra-30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 29,cohere-xlarge-v20220609-52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 30,jurassic-2-large-7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 31,davinci-175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 32,llama-7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 33,redpajama-incite-instruct-7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 34,j1-jumbo-v1-178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 35,glm-130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 36,luminous-extended-30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 37,opt-66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 38,bloom-176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 39,j1-grande-v1-17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 40,alpaca-7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 41,falcon-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 42,redpajama-incite-base-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 43,cohere-large-v20220720-13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 44,redpajama-incite-instruct-v1-3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 45,text-curie-001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 46,gpt-neox-20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 47,luminous-base-13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 48,cohere-medium-v20221108-6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 49,redpajama-incite-base-v1-3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 50,tnlg-v2-6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 51,j1-large-v1-7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 52,gpt-j-6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 53,pythia-12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 54,curie-6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 55,falcon-instruct-7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 56,cohere-medium-v20220720-6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 57,text-babbage-001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 58,t0pp-11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 59,pythia-6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 60,ul2-20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 61,t5-11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 62,babbage-1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 63,cohere-small-v20220720-410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 64,ada-350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 65,text-ada-001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 66,yalm-100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" 67,llama-2-70b,0.582,mmlu,helm_classic_240130,[] 68,llama-65b,0.584,mmlu,helm_classic_240130,[] 69,text-davinci-002,0.568,mmlu,helm_classic_240130,[] 70,mistral-v0.1-7b,0.572,mmlu,helm_classic_240130,[] 71,cohere-command-beta-52.4b,0.452,mmlu,helm_classic_240130,[] 72,text-davinci-003,0.569,mmlu,helm_classic_240130,[] 73,jurassic-2-jumbo-178b,0.48,mmlu,helm_classic_240130,[] 74,llama-2-13b,0.507,mmlu,helm_classic_240130,[] 75,tnlg-v2-530b,0.469,mmlu,helm_classic_240130,[] 76,gpt-3.5-turbo-0613,0.391,mmlu,helm_classic_240130,[] 77,llama-30b,0.531,mmlu,helm_classic_240130,[] 78,anthropic-lm-v4-s3-52b,0.481,mmlu,helm_classic_240130,[] 79,gpt-3.5-turbo-0301,0.59,mmlu,helm_classic_240130,[] 80,jurassic-2-grande-17b,0.475,mmlu,helm_classic_240130,[] 81,palmyra-x-43b,0.609,mmlu,helm_classic_240130,[] 82,falcon-40b,0.509,mmlu,helm_classic_240130,[] 83,falcon-instruct-40b,0.497,mmlu,helm_classic_240130,[] 84,mpt-instruct-30b,0.444,mmlu,helm_classic_240130,[] 85,mpt-30b,0.437,mmlu,helm_classic_240130,[] 86,j1-grande-v2-beta-17b,0.445,mmlu,helm_classic_240130,[] 87,vicuna-v1.3-13b,0.462,mmlu,helm_classic_240130,[] 88,cohere-command-beta-6.1b,0.406,mmlu,helm_classic_240130,[] 89,cohere-xlarge-v20221108-52.4b,0.382,mmlu,helm_classic_240130,[] 90,luminous-supreme-70b,0.38,mmlu,helm_classic_240130,[] 91,vicuna-v1.3-7b,0.434,mmlu,helm_classic_240130,[] 92,opt-175b,0.318,mmlu,helm_classic_240130,[] 93,llama-2-7b,0.431,mmlu,helm_classic_240130,[] 94,llama-13b,0.422,mmlu,helm_classic_240130,[] 95,instructpalmyra-30b,0.403,mmlu,helm_classic_240130,[] 96,cohere-xlarge-v20220609-52.4b,0.353,mmlu,helm_classic_240130,[] 97,jurassic-2-large-7.5b,0.339,mmlu,helm_classic_240130,[] 98,davinci-175b,0.422,mmlu,helm_classic_240130,[] 99,llama-7b,0.321,mmlu,helm_classic_240130,[] 100,redpajama-incite-instruct-7b,0.363,mmlu,helm_classic_240130,[] 101,j1-jumbo-v1-178b,0.259,mmlu,helm_classic_240130,[] 102,glm-130b,0.344,mmlu,helm_classic_240130,[] 103,luminous-extended-30b,0.321,mmlu,helm_classic_240130,[] 104,opt-66b,0.276,mmlu,helm_classic_240130,[] 105,bloom-176b,0.299,mmlu,helm_classic_240130,[] 106,j1-grande-v1-17b,0.27,mmlu,helm_classic_240130,[] 107,alpaca-7b,0.385,mmlu,helm_classic_240130,[] 108,falcon-7b,0.286,mmlu,helm_classic_240130,[] 109,redpajama-incite-base-7b,0.302,mmlu,helm_classic_240130,[] 110,cohere-large-v20220720-13.1b,0.324,mmlu,helm_classic_240130,[] 111,redpajama-incite-instruct-v1-3b,0.257,mmlu,helm_classic_240130,[] 112,text-curie-001,0.237,mmlu,helm_classic_240130,[] 113,gpt-neox-20b,0.276,mmlu,helm_classic_240130,[] 114,luminous-base-13b,0.27,mmlu,helm_classic_240130,[] 115,cohere-medium-v20221108-6.1b,0.254,mmlu,helm_classic_240130,[] 116,redpajama-incite-base-v1-3b,0.263,mmlu,helm_classic_240130,[] 117,tnlg-v2-6.7b,0.242,mmlu,helm_classic_240130,[] 118,j1-large-v1-7.5b,0.241,mmlu,helm_classic_240130,[] 119,gpt-j-6b,0.249,mmlu,helm_classic_240130,[] 120,pythia-12b,0.274,mmlu,helm_classic_240130,[] 121,curie-6.7b,0.243,mmlu,helm_classic_240130,[] 122,falcon-instruct-7b,0.275,mmlu,helm_classic_240130,[] 123,cohere-medium-v20220720-6.1b,0.279,mmlu,helm_classic_240130,[] 124,text-babbage-001,0.229,mmlu,helm_classic_240130,[] 125,t0pp-11b,0.407,mmlu,helm_classic_240130,[] 126,pythia-6.9b,0.236,mmlu,helm_classic_240130,[] 127,ul2-20b,0.291,mmlu,helm_classic_240130,[] 128,t5-11b,0.29,mmlu,helm_classic_240130,[] 129,babbage-1.3b,0.235,mmlu,helm_classic_240130,[] 130,cohere-small-v20220720-410m,0.264,mmlu,helm_classic_240130,[] 131,ada-350m,0.243,mmlu,helm_classic_240130,[] 132,text-ada-001,0.238,mmlu,helm_classic_240130,[] 133,yalm-100b,0.243,mmlu,helm_classic_240130,[] 0,gpt-4o-0513,35.7,wildbench-mix,wildbench_240612,[] 1,gpt-4-turbo-0409,34.6,wildbench-mix,wildbench_240612,[] 2,gpt-4-turbo-0125,29.9,wildbench-mix,wildbench_240612,[] 3,gemini-1.5-pro,27.8,wildbench-mix,wildbench_240612,[] 4,llama-3-70b-inst,21.0,wildbench-mix,wildbench_240612,[] 5,claude-3-opus,20.1,wildbench-mix,wildbench_240612,[] 6,gemini-1.5-flash,17.4,wildbench-mix,wildbench_240612,[] 7,yi-1.5-34b-chat,16.8,wildbench-mix,wildbench_240612,[] 8,llama3-inst-8b-simpo,14.0,wildbench-mix,wildbench_240612,[] 9,claude-3-sonnet,7.2,wildbench-mix,wildbench_240612,[] 10,qwen1.5-72b-chat,4.4,wildbench-mix,wildbench_240612,[] 11,command-r-plus,0.4,wildbench-mix,wildbench_240612,[] 12,claude-3-haiku,-8.5,wildbench-mix,wildbench_240612,[] 13,mistral-large,-10.5,wildbench-mix,wildbench_240612,[] 14,starlinglm-7b-beta,-11.9,wildbench-mix,wildbench_240612,[] 15,llama-3-8b-inst,-14.6,wildbench-mix,wildbench_240612,[] 16,command-r,-16.0,wildbench-mix,wildbench_240612,[] 17,mixtral-8x7b-inst,-18.8,wildbench-mix,wildbench_240612,[] 18,dbrx-instruct,-21.6,wildbench-mix,wildbench_240612,[] 19,yi-1.5-6b-chat,-24.3,wildbench-mix,wildbench_240612,[] 20,mistral-7b-inst-v0.2,-25.0,wildbench-mix,wildbench_240612,[] 21,tulu-2-dpo-70b,-25.4,wildbench-mix,wildbench_240612,[] 22,llama-2-70b-chat,-26.8,wildbench-mix,wildbench_240612,[] 23,qwen1.5-7b-chat,-27.0,wildbench-mix,wildbench_240612,[] 24,phi-3-medium-128k,-33.3,wildbench-mix,wildbench_240612,[] 25,gpt-3.5-turbo-0125,-33.5,wildbench-mix,wildbench_240612,[] 26,llama-2-7b-chat,-48.0,wildbench-mix,wildbench_240612,[] 27,gemma-7b-it,-57.0,wildbench-mix,wildbench_240612,[] 28,gemma-2b-it,-74.1,wildbench-mix,wildbench_240612,[] 13,flan-t5-xxl,0.2244897959183673,mmlu_pro,bluebench_v02,[] 30,granite-13b-chat-v2,0.2857142857142857,mmlu_pro,bluebench_v02,[] 41,granite-13b-instruct-v2,0.0408163265306122,mmlu_pro,bluebench_v02,[] 50,granite-7b-lab,0.2423469387755102,mmlu_pro,bluebench_v02,[] 60,llama-2-13b-chat,0.0943877551020408,mmlu_pro,bluebench_v02,[] 70,llama-2-70b,0.4081632653061224,mmlu_pro,bluebench_v02,[] 81,llama-3-70b-instruct,0.4285714285714285,mmlu_pro,bluebench_v02,[] 92,llama-3-8b,0.375,mmlu_pro,bluebench_v02,[] 103,llama-3-8b-instruct,0.0994897959183673,mmlu_pro,bluebench_v02,[] 112,llama-30b,0.3061224489795918,mmlu_pro,bluebench_v02,[] 121,llama-7b,0.1326530612244897,mmlu_pro,bluebench_v02,[] 132,mistral-v0.1-7b,0.2857142857142857,mmlu_pro,bluebench_v02,[] 143,mixtral-8x7b-instruct-v01,0.375,mmlu_pro,bluebench_v02,[] 153,vicuna-13b-v1.5-16k,0.2857142857142857,mmlu_pro,bluebench_v02,[] 162,vicuna-33b-v1.3,0.2653061224489796,mmlu_pro,bluebench_v02,[] 172,vicuna-v1.3-7b,0.1938775510204081,mmlu_pro,bluebench_v02,[] 182,vicuna-7b-v1.5,0.2857142857142857,mmlu_pro,bluebench_v02,[] 192,zephyr-7b-beta,0.2959183673469387,mmlu_pro,bluebench_v02,[]