lumos_data_demo / data_dir /elo_ranks.length_ablation.all.jsonl
DaYin's picture
Upload 69 files
e90fa51 verified
raw
history blame contribute delete
No virus
7.86 kB
{"model name ": "gpt-4-0125-preview", "elo overall": 1228, "Information seeking": 1274, "Creative Writing": 1202, "Coding & Debugging": 1224, "Reasoning": 1285, "Editing": 1132, "Math": 1202, "Planning": 1198, "Brainstorming": 1181, "Role playing": 1225, "Advice seeking": 1274, "Data Analysis": 1198, "Others": 1033, "average": 1202.3333333333333, "# battles": 5430}
{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1112, "Information seeking": 1102, "Creative Writing": 1136, "Coding & Debugging": 1119, "Reasoning": 1071, "Editing": 1080, "Math": 1094, "Planning": 1057, "Brainstorming": 1038, "Role playing": 1126, "Advice seeking": 1050, "Data Analysis": 1083, "Others": 1010, "average": 1080.5, "# battles": 2371}
{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1112, "Information seeking": 1206, "Creative Writing": 1171, "Coding & Debugging": 938, "Reasoning": 1091, "Editing": 1057, "Math": 993, "Planning": 1105, "Brainstorming": 1136, "Role playing": 1128, "Advice seeking": 1110, "Data Analysis": 965, "Others": 1017, "average": 1076.4166666666667, "# battles": 1942}
{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 1107, "Information seeking": 1184, "Creative Writing": 1146, "Coding & Debugging": 937, "Reasoning": 1095, "Editing": 1017, "Math": 993, "Planning": 1113, "Brainstorming": 1140, "Role playing": 1141, "Advice seeking": 1136, "Data Analysis": 996, "Others": 1008, "average": 1075.5, "# battles": 1898}
{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1096, "Information seeking": 1182, "Creative Writing": 1135, "Coding & Debugging": 924, "Reasoning": 1075, "Editing": 1038, "Math": 1004, "Planning": 1093, "Brainstorming": 1141, "Role playing": 1103, "Advice seeking": 1116, "Data Analysis": 969, "Others": 1032, "average": 1067.6666666666667, "# battles": 1888}
{"model name ": "zephyr-7b-beta", "elo overall": 1079, "Information seeking": 1027, "Creative Writing": 1094, "Coding & Debugging": 1134, "Reasoning": 1047, "Editing": 1091, "Math": 1051, "Planning": 1064, "Brainstorming": 1075, "Role playing": 1038, "Advice seeking": 985, "Data Analysis": 1092, "Others": 1017, "average": 1059.5833333333333, "# battles": 3367}
{"model name ": "Yi-34B-Chat", "elo overall": 1047, "Information seeking": 1082, "Creative Writing": 1068, "Coding & Debugging": 952, "Reasoning": 1086, "Editing": 987, "Math": 1064, "Planning": 1090, "Brainstorming": 1119, "Role playing": 1055, "Advice seeking": 1117, "Data Analysis": 970, "Others": 1004, "average": 1049.5, "# battles": 2428}
{"model name ": "tulu-2-dpo-70b", "elo overall": 1037, "Information seeking": 1012, "Creative Writing": 1068, "Coding & Debugging": 1040, "Reasoning": 1023, "Editing": 1066, "Math": 1044, "Planning": 1027, "Brainstorming": 1007, "Role playing": 1060, "Advice seeking": 1016, "Data Analysis": 1030, "Others": 1026, "average": 1034.9166666666667, "# battles": 3435}
{"model name ": "claude-3-sonnet-20240229", "elo overall": 1014, "Information seeking": 979, "Creative Writing": 944, "Coding & Debugging": 1162, "Reasoning": 1056, "Editing": 1056, "Math": 1076, "Planning": 1037, "Brainstorming": 953, "Role playing": 899, "Advice seeking": 985, "Data Analysis": 1080, "Others": 988, "average": 1017.9166666666666, "# battles": 2608}
{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1011, "Information seeking": 1002, "Creative Writing": 1001, "Coding & Debugging": 1055, "Reasoning": 1004, "Editing": 1001, "Math": 1008, "Planning": 975, "Brainstorming": 967, "Role playing": 1029, "Advice seeking": 965, "Data Analysis": 1028, "Others": 1021, "average": 1004.6666666666666, "# battles": 3418}
{"model name ": "claude-3-opus-20240229", "elo overall": 1008, "Information seeking": 965, "Creative Writing": 941, "Coding & Debugging": 1191, "Reasoning": 1044, "Editing": 1082, "Math": 1045, "Planning": 1028, "Brainstorming": 945, "Role playing": 886, "Advice seeking": 1014, "Data Analysis": 1089, "Others": 999, "average": 1019.0833333333334, "# battles": 2367}
{"model name ": "command", "elo overall": 1000, "Information seeking": 969, "Creative Writing": 982, "Coding & Debugging": 1071, "Reasoning": 996, "Editing": 1024, "Math": 1000, "Planning": 1059, "Brainstorming": 1033, "Role playing": 951, "Advice seeking": 978, "Data Analysis": 1018, "Others": 994, "average": 1006.25, "# battles": 1733}
{"model name ": "mistral-large-2402", "elo overall": 986, "Information seeking": 944, "Creative Writing": 993, "Coding & Debugging": 1011, "Reasoning": 978, "Editing": 993, "Math": 996, "Planning": 930, "Brainstorming": 987, "Role playing": 985, "Advice seeking": 951, "Data Analysis": 1015, "Others": 1004, "average": 982.25, "# battles": 1881}
{"model name ": "Llama-2-70b-chat-hf", "elo overall": 955, "Information seeking": 978, "Creative Writing": 968, "Coding & Debugging": 861, "Reasoning": 988, "Editing": 987, "Math": 950, "Planning": 994, "Brainstorming": 1017, "Role playing": 974, "Advice seeking": 1034, "Data Analysis": 923, "Others": 1025, "average": 974.9166666666666, "# battles": 2391}
{"model name ": "gemini-1.0-pro", "elo overall": 950, "Information seeking": 940, "Creative Writing": 942, "Coding & Debugging": 998, "Reasoning": 964, "Editing": 958, "Math": 972, "Planning": 912, "Brainstorming": 946, "Role playing": 983, "Advice seeking": 923, "Data Analysis": 1004, "Others": 998, "average": 961.6666666666666, "# battles": 1816}
{"model name ": "Llama-2-13b-chat-hf", "elo overall": 934, "Information seeking": 956, "Creative Writing": 959, "Coding & Debugging": 838, "Reasoning": 940, "Editing": 983, "Math": 991, "Planning": 967, "Brainstorming": 1001, "Role playing": 978, "Advice seeking": 1013, "Data Analysis": 926, "Others": 1001, "average": 962.75, "# battles": 2350}
{"model name ": "Llama-2-7b-chat-hf", "elo overall": 924, "Information seeking": 943, "Creative Writing": 951, "Coding & Debugging": 817, "Reasoning": 943, "Editing": 961, "Math": 938, "Planning": 979, "Brainstorming": 982, "Role playing": 953, "Advice seeking": 985, "Data Analysis": 913, "Others": 1007, "average": 947.6666666666666, "# battles": 2297}
{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 917, "Information seeking": 867, "Creative Writing": 917, "Coding & Debugging": 1006, "Reasoning": 894, "Editing": 963, "Math": 969, "Planning": 883, "Brainstorming": 870, "Role playing": 958, "Advice seeking": 857, "Data Analysis": 1011, "Others": 988, "average": 931.9166666666666, "# battles": 2480}
{"model name ": "gemma-7b-it", "elo overall": 915, "Information seeking": 914, "Creative Writing": 864, "Coding & Debugging": 1028, "Reasoning": 894, "Editing": 880, "Math": 939, "Planning": 887, "Brainstorming": 875, "Role playing": 954, "Advice seeking": 906, "Data Analysis": 988, "Others": 1002, "average": 927.5833333333334, "# battles": 2519}
{"model name ": "gemma-2b-it", "elo overall": 884, "Information seeking": 885, "Creative Writing": 890, "Coding & Debugging": 925, "Reasoning": 837, "Editing": 895, "Math": 836, "Planning": 923, "Brainstorming": 902, "Role playing": 910, "Advice seeking": 897, "Data Analysis": 911, "Others": 987, "average": 899.8333333333334, "# battles": 2484}
{"model name ": "vicuna-13b-v1.5", "elo overall": 846, "Information seeking": 837, "Creative Writing": 837, "Coding & Debugging": 830, "Reasoning": 851, "Editing": 874, "Math": 931, "Planning": 852, "Brainstorming": 879, "Role playing": 891, "Advice seeking": 883, "Data Analysis": 882, "Others": 993, "average": 878.3333333333334, "# battles": 2478}
{"model name ": "gpt-3.5-turbo-0125", "elo overall": 822, "Information seeking": 754, "Creative Writing": 789, "Coding & Debugging": 955, "Reasoning": 844, "Editing": 880, "Math": 902, "Planning": 826, "Brainstorming": 815, "Role playing": 777, "Advice seeking": 803, "Data Analysis": 918, "Others": 844, "average": 842.25, "# battles": 14003}