migtissera committed
Commit 9099977
1 Parent(s): 564e6fe

Upload 3 files

Evals/Tess-v2.5-Qwen2-72B-agieval_nous.json ADDED
@@ -0,0 +1,506 @@
1
+ {
2
+ "results": {
3
+ "agieval_nous": {
4
+ "acc_norm,none": 0.5357423409269442,
5
+ "acc_norm_stderr,none": 0.00952110343229611,
6
+ "acc,none": 0.6233307148468186,
7
+ "acc_stderr,none": 0.009151821236525831,
8
+ "alias": "agieval_nous"
9
+ },
10
+ "agieval_aqua_rat": {
11
+ "acc,none": 0.5511811023622047,
12
+ "acc_stderr,none": 0.03126961011656295,
13
+ "acc_norm,none": 0.5118110236220472,
14
+ "acc_norm_stderr,none": 0.031425959141896394,
15
+ "alias": " - agieval_aqua_rat"
16
+ },
17
+ "agieval_logiqa_en": {
18
+ "acc,none": 0.554531490015361,
19
+ "acc_stderr,none": 0.019494627133439985,
20
+ "acc_norm,none": 0.46236559139784944,
21
+ "acc_norm_stderr,none": 0.019555980839597826,
22
+ "alias": " - agieval_logiqa_en"
23
+ },
24
+ "agieval_lsat_ar": {
25
+ "acc,none": 0.26956521739130435,
26
+ "acc_stderr,none": 0.02932276422894952,
27
+ "acc_norm,none": 0.2565217391304348,
28
+ "acc_norm_stderr,none": 0.028858814315305643,
29
+ "alias": " - agieval_lsat_ar"
30
+ },
31
+ "agieval_lsat_lr": {
32
+ "acc,none": 0.7,
33
+ "acc_stderr,none": 0.020311909655921973,
34
+ "acc_norm,none": 0.5764705882352941,
35
+ "acc_norm_stderr,none": 0.021901379648792133,
36
+ "alias": " - agieval_lsat_lr"
37
+ },
38
+ "agieval_lsat_rc": {
39
+ "acc,none": 0.7881040892193308,
40
+ "acc_stderr,none": 0.02496236224822418,
41
+ "acc_norm,none": 0.6765799256505576,
42
+ "acc_norm_stderr,none": 0.028574302844503813,
43
+ "alias": " - agieval_lsat_rc"
44
+ },
45
+ "agieval_sat_en": {
46
+ "acc,none": 0.8689320388349514,
47
+ "acc_stderr,none": 0.02357025313368066,
48
+ "acc_norm,none": 0.8446601941747572,
49
+ "acc_norm_stderr,none": 0.02529912276040303,
50
+ "alias": " - agieval_sat_en"
51
+ },
52
+ "agieval_sat_en_without_passage": {
53
+ "acc,none": 0.616504854368932,
54
+ "acc_stderr,none": 0.033960279445866416,
55
+ "acc_norm,none": 0.5194174757281553,
56
+ "acc_norm_stderr,none": 0.03489517135066013,
57
+ "alias": " - agieval_sat_en_without_passage"
58
+ },
59
+ "agieval_sat_math": {
60
+ "acc,none": 0.6772727272727272,
61
+ "acc_stderr,none": 0.03159203270502094,
62
+ "acc_norm,none": 0.5318181818181819,
63
+ "acc_norm_stderr,none": 0.03371838809107287,
64
+ "alias": " - agieval_sat_math"
65
+ }
66
+ },
67
+ "groups": {
68
+ "agieval_nous": {
69
+ "acc_norm,none": 0.5357423409269442,
70
+ "acc_norm_stderr,none": 0.00952110343229611,
71
+ "acc,none": 0.6233307148468186,
72
+ "acc_stderr,none": 0.009151821236525831,
73
+ "alias": "agieval_nous"
74
+ }
75
+ },
76
+ "group_subtasks": {
77
+ "agieval_nous": [
78
+ "agieval_sat_en",
79
+ "agieval_lsat_ar",
80
+ "agieval_sat_en_without_passage",
81
+ "agieval_aqua_rat",
82
+ "agieval_logiqa_en",
83
+ "agieval_sat_math",
84
+ "agieval_lsat_rc",
85
+ "agieval_lsat_lr"
86
+ ]
87
+ },
88
+ "configs": {
89
+ "agieval_aqua_rat": {
90
+ "task": "agieval_aqua_rat",
91
+ "group": [
92
+ "agieval",
93
+ "agieval_en",
94
+ "agieval_nous"
95
+ ],
96
+ "dataset_path": "hails/agieval-aqua-rat",
97
+ "test_split": "test",
98
+ "doc_to_text": "{{query}}",
99
+ "doc_to_target": "{{gold}}",
100
+ "doc_to_choice": "{{choices}}",
101
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
102
+ "description": "",
103
+ "target_delimiter": " ",
104
+ "fewshot_delimiter": "\n\n",
105
+ "num_fewshot": 0,
106
+ "metric_list": [
107
+ {
108
+ "metric": "acc",
109
+ "aggregation": "mean",
110
+ "higher_is_better": true
111
+ },
112
+ {
113
+ "metric": "acc_norm",
114
+ "aggregation": "mean",
115
+ "higher_is_better": true
116
+ }
117
+ ],
118
+ "output_type": "multiple_choice",
119
+ "repeats": 1,
120
+ "should_decontaminate": false,
121
+ "metadata": {
122
+ "version": 1.0
123
+ }
124
+ },
125
+ "agieval_logiqa_en": {
126
+ "task": "agieval_logiqa_en",
127
+ "group": [
128
+ "agieval",
129
+ "agieval_nous",
130
+ "agieval_en"
131
+ ],
132
+ "dataset_path": "hails/agieval-logiqa-en",
133
+ "test_split": "test",
134
+ "doc_to_text": "{{query}}",
135
+ "doc_to_target": "{{gold}}",
136
+ "doc_to_choice": "{{choices}}",
137
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
138
+ "description": "",
139
+ "target_delimiter": " ",
140
+ "fewshot_delimiter": "\n\n",
141
+ "num_fewshot": 0,
142
+ "metric_list": [
143
+ {
144
+ "metric": "acc",
145
+ "aggregation": "mean",
146
+ "higher_is_better": true
147
+ },
148
+ {
149
+ "metric": "acc_norm",
150
+ "aggregation": "mean",
151
+ "higher_is_better": true
152
+ }
153
+ ],
154
+ "output_type": "multiple_choice",
155
+ "repeats": 1,
156
+ "should_decontaminate": false,
157
+ "metadata": {
158
+ "version": 1.0
159
+ }
160
+ },
161
+ "agieval_lsat_ar": {
162
+ "task": "agieval_lsat_ar",
163
+ "group": [
164
+ "agieval",
165
+ "agieval_nous",
166
+ "agieval_en"
167
+ ],
168
+ "dataset_path": "hails/agieval-lsat-ar",
169
+ "test_split": "test",
170
+ "doc_to_text": "{{query}}",
171
+ "doc_to_target": "{{gold}}",
172
+ "doc_to_choice": "{{choices}}",
173
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
174
+ "description": "",
175
+ "target_delimiter": " ",
176
+ "fewshot_delimiter": "\n\n",
177
+ "num_fewshot": 0,
178
+ "metric_list": [
179
+ {
180
+ "metric": "acc",
181
+ "aggregation": "mean",
182
+ "higher_is_better": true
183
+ },
184
+ {
185
+ "metric": "acc_norm",
186
+ "aggregation": "mean",
187
+ "higher_is_better": true
188
+ }
189
+ ],
190
+ "output_type": "multiple_choice",
191
+ "repeats": 1,
192
+ "should_decontaminate": false,
193
+ "metadata": {
194
+ "version": 1.0
195
+ }
196
+ },
197
+ "agieval_lsat_lr": {
198
+ "task": "agieval_lsat_lr",
199
+ "group": [
200
+ "agieval",
201
+ "agieval_nous",
202
+ "agieval_en"
203
+ ],
204
+ "dataset_path": "hails/agieval-lsat-lr",
205
+ "test_split": "test",
206
+ "doc_to_text": "{{query}}",
207
+ "doc_to_target": "{{gold}}",
208
+ "doc_to_choice": "{{choices}}",
209
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
210
+ "description": "",
211
+ "target_delimiter": " ",
212
+ "fewshot_delimiter": "\n\n",
213
+ "num_fewshot": 0,
214
+ "metric_list": [
215
+ {
216
+ "metric": "acc",
217
+ "aggregation": "mean",
218
+ "higher_is_better": true
219
+ },
220
+ {
221
+ "metric": "acc_norm",
222
+ "aggregation": "mean",
223
+ "higher_is_better": true
224
+ }
225
+ ],
226
+ "output_type": "multiple_choice",
227
+ "repeats": 1,
228
+ "should_decontaminate": false,
229
+ "metadata": {
230
+ "version": 1.0
231
+ }
232
+ },
233
+ "agieval_lsat_rc": {
234
+ "task": "agieval_lsat_rc",
235
+ "group": [
236
+ "agieval",
237
+ "agieval_nous",
238
+ "agieval_en"
239
+ ],
240
+ "dataset_path": "hails/agieval-lsat-rc",
241
+ "test_split": "test",
242
+ "doc_to_text": "{{query}}",
243
+ "doc_to_target": "{{gold}}",
244
+ "doc_to_choice": "{{choices}}",
245
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
246
+ "description": "",
247
+ "target_delimiter": " ",
248
+ "fewshot_delimiter": "\n\n",
249
+ "num_fewshot": 0,
250
+ "metric_list": [
251
+ {
252
+ "metric": "acc",
253
+ "aggregation": "mean",
254
+ "higher_is_better": true
255
+ },
256
+ {
257
+ "metric": "acc_norm",
258
+ "aggregation": "mean",
259
+ "higher_is_better": true
260
+ }
261
+ ],
262
+ "output_type": "multiple_choice",
263
+ "repeats": 1,
264
+ "should_decontaminate": false,
265
+ "metadata": {
266
+ "version": 1.0
267
+ }
268
+ },
269
+ "agieval_sat_en": {
270
+ "task": "agieval_sat_en",
271
+ "group": [
272
+ "agieval",
273
+ "agieval_nous",
274
+ "agieval_en"
275
+ ],
276
+ "dataset_path": "hails/agieval-sat-en",
277
+ "test_split": "test",
278
+ "doc_to_text": "{{query}}",
279
+ "doc_to_target": "{{gold}}",
280
+ "doc_to_choice": "{{choices}}",
281
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
282
+ "description": "",
283
+ "target_delimiter": " ",
284
+ "fewshot_delimiter": "\n\n",
285
+ "num_fewshot": 0,
286
+ "metric_list": [
287
+ {
288
+ "metric": "acc",
289
+ "aggregation": "mean",
290
+ "higher_is_better": true
291
+ },
292
+ {
293
+ "metric": "acc_norm",
294
+ "aggregation": "mean",
295
+ "higher_is_better": true
296
+ }
297
+ ],
298
+ "output_type": "multiple_choice",
299
+ "repeats": 1,
300
+ "should_decontaminate": false,
301
+ "metadata": {
302
+ "version": 1.0
303
+ }
304
+ },
305
+ "agieval_sat_en_without_passage": {
306
+ "task": "agieval_sat_en_without_passage",
307
+ "group": [
308
+ "agieval",
309
+ "agieval_nous",
310
+ "agieval_en"
311
+ ],
312
+ "dataset_path": "hails/agieval-sat-en-without-passage",
313
+ "test_split": "test",
314
+ "doc_to_text": "{{query}}",
315
+ "doc_to_target": "{{gold}}",
316
+ "doc_to_choice": "{{choices}}",
317
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
318
+ "description": "",
319
+ "target_delimiter": " ",
320
+ "fewshot_delimiter": "\n\n",
321
+ "num_fewshot": 0,
322
+ "metric_list": [
323
+ {
324
+ "metric": "acc",
325
+ "aggregation": "mean",
326
+ "higher_is_better": true
327
+ },
328
+ {
329
+ "metric": "acc_norm",
330
+ "aggregation": "mean",
331
+ "higher_is_better": true
332
+ }
333
+ ],
334
+ "output_type": "multiple_choice",
335
+ "repeats": 1,
336
+ "should_decontaminate": false,
337
+ "metadata": {
338
+ "version": 1.0
339
+ }
340
+ },
341
+ "agieval_sat_math": {
342
+ "task": "agieval_sat_math",
343
+ "group": [
344
+ "agieval",
345
+ "agieval_nous",
346
+ "agieval_en"
347
+ ],
348
+ "dataset_path": "hails/agieval-sat-math",
349
+ "test_split": "test",
350
+ "doc_to_text": "{{query}}",
351
+ "doc_to_target": "{{gold}}",
352
+ "doc_to_choice": "{{choices}}",
353
+ "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
354
+ "description": "",
355
+ "target_delimiter": " ",
356
+ "fewshot_delimiter": "\n\n",
357
+ "num_fewshot": 0,
358
+ "metric_list": [
359
+ {
360
+ "metric": "acc",
361
+ "aggregation": "mean",
362
+ "higher_is_better": true
363
+ },
364
+ {
365
+ "metric": "acc_norm",
366
+ "aggregation": "mean",
367
+ "higher_is_better": true
368
+ }
369
+ ],
370
+ "output_type": "multiple_choice",
371
+ "repeats": 1,
372
+ "should_decontaminate": false,
373
+ "metadata": {
374
+ "version": 1.0
375
+ }
376
+ }
377
+ },
378
+ "versions": {
379
+ "agieval_aqua_rat": 1.0,
380
+ "agieval_logiqa_en": 1.0,
381
+ "agieval_lsat_ar": 1.0,
382
+ "agieval_lsat_lr": 1.0,
383
+ "agieval_lsat_rc": 1.0,
384
+ "agieval_sat_en": 1.0,
385
+ "agieval_sat_en_without_passage": 1.0,
386
+ "agieval_sat_math": 1.0
387
+ },
388
+ "n-shot": {
389
+ "agieval_aqua_rat": 0,
390
+ "agieval_logiqa_en": 0,
391
+ "agieval_lsat_ar": 0,
392
+ "agieval_lsat_lr": 0,
393
+ "agieval_lsat_rc": 0,
394
+ "agieval_nous": 0,
395
+ "agieval_sat_en": 0,
396
+ "agieval_sat_en_without_passage": 0,
397
+ "agieval_sat_math": 0
398
+ },
399
+ "higher_is_better": {
400
+ "agieval_aqua_rat": {
401
+ "acc": true,
402
+ "acc_norm": true
403
+ },
404
+ "agieval_logiqa_en": {
405
+ "acc": true,
406
+ "acc_norm": true
407
+ },
408
+ "agieval_lsat_ar": {
409
+ "acc": true,
410
+ "acc_norm": true
411
+ },
412
+ "agieval_lsat_lr": {
413
+ "acc": true,
414
+ "acc_norm": true
415
+ },
416
+ "agieval_lsat_rc": {
417
+ "acc": true,
418
+ "acc_norm": true
419
+ },
420
+ "agieval_nous": {
421
+ "acc": true,
422
+ "acc_norm": true
423
+ },
424
+ "agieval_sat_en": {
425
+ "acc": true,
426
+ "acc_norm": true
427
+ },
428
+ "agieval_sat_en_without_passage": {
429
+ "acc": true,
430
+ "acc_norm": true
431
+ },
432
+ "agieval_sat_math": {
433
+ "acc": true,
434
+ "acc_norm": true
435
+ }
436
+ },
437
+ "n-samples": {
438
+ "agieval_sat_en": {
439
+ "original": 206,
440
+ "effective": 206
441
+ },
442
+ "agieval_lsat_ar": {
443
+ "original": 230,
444
+ "effective": 230
445
+ },
446
+ "agieval_sat_en_without_passage": {
447
+ "original": 206,
448
+ "effective": 206
449
+ },
450
+ "agieval_aqua_rat": {
451
+ "original": 254,
452
+ "effective": 254
453
+ },
454
+ "agieval_logiqa_en": {
455
+ "original": 651,
456
+ "effective": 651
457
+ },
458
+ "agieval_sat_math": {
459
+ "original": 220,
460
+ "effective": 220
461
+ },
462
+ "agieval_lsat_rc": {
463
+ "original": 269,
464
+ "effective": 269
465
+ },
466
+ "agieval_lsat_lr": {
467
+ "original": 510,
468
+ "effective": 510
469
+ }
470
+ },
471
+ "config": {
472
+ "model": "hf",
473
+ "model_args": "pretrained=/home/migel/Tess-v2.5-qwen2-72B-safetensors,parallelize=True",
474
+ "model_num_parameters": 72706203648,
475
+ "model_dtype": "torch.float16",
476
+ "model_revision": "main",
477
+ "model_sha": "",
478
+ "batch_size": "16",
479
+ "batch_sizes": [],
480
+ "device": null,
481
+ "use_cache": null,
482
+ "limit": null,
483
+ "bootstrap_iters": 100000,
484
+ "gen_kwargs": null,
485
+ "random_seed": 0,
486
+ "numpy_seed": 1234,
487
+ "torch_seed": 1234,
488
+ "fewshot_seed": 1234
489
+ },
490
+ "git_hash": "b3e4c49a",
491
+ "date": 1718163625.5715299,
492
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.31\n\nPython version: 3.10.14 (main, Apr 6 2024, 18:45:05) [GCC 9.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1050-azure-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\nGPU 2: NVIDIA A100 80GB PCIe\nGPU 3: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 530.30.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 48 bits physical, 48 bits virtual\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nNUMA node(s): 4\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7V13 64-Core Processor\nStepping: 1\nCPU MHz: 2445.435\nBogoMIPS: 4890.87\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB\nL1i cache: 3 MiB\nL2 cache: 48 MiB\nL3 cache: 384 MiB\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] magma-cuda117 2.6.1 1 pytorch\n[conda] mkl 2022.2.1 pypi_0 pypi\n[conda] mkl-include 2022.2.1 pypi_0 pypi\n[conda] numpy 1.24.4 pypi_0 pypi\n[conda] pytorch-lightning 1.9.5 pypi_0 pypi\n[conda] torch 2.0.1 pypi_0 pypi\n[conda] torch-nebula 0.16.10 pypi_0 pypi\n[conda] torch-ort 1.17.0 pypi_0 pypi\n[conda] torchaudio 2.0.2+cu117 pypi_0 pypi\n[conda] torchdata 0.6.1 pypi_0 pypi\n[conda] torchmetrics 1.2.0 pypi_0 pypi\n[conda] torchsnapshot 0.1.0 pypi_0 pypi\n[conda] torchvision 0.15.2+cu117 pypi_0 pypi\n[conda] triton 2.0.0 pypi_0 pypi",
493
+ "transformers_version": "4.41.1",
494
+ "upper_git_hash": null,
495
+ "task_hashes": {},
496
+ "model_source": "hf",
497
+ "model_name": "/home/migel/Tess-v2.5-qwen2-72B-safetensors",
498
+ "model_name_sanitized": "__home__migel__Tess-v2.5-qwen2-72B-safetensors",
499
+ "system_instruction": null,
500
+ "system_instruction_sha": null,
501
+ "chat_template": null,
502
+ "chat_template_sha": null,
503
+ "start_time": 377200.61189737,
504
+ "end_time": 380116.891366629,
505
+ "total_evaluation_time_seconds": "2916.279469258967"
506
+ }
Evals/Tess-v2.5-Qwen2-72B-hellaswag.json ADDED
@@ -0,0 +1,104 @@
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "acc,none": 0.6811392152957578,
5
+ "acc_stderr,none": 0.004650825168905212,
6
+ "acc_norm,none": 0.8729336785500896,
7
+ "acc_norm_stderr,none": 0.0033236659644120307,
8
+ "alias": "hellaswag"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "group": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
24
+ "doc_to_text": "{{query}}",
25
+ "doc_to_target": "{{label}}",
26
+ "doc_to_choice": "choices",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 10,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": false,
46
+ "metadata": {
47
+ "version": 1.0
48
+ }
49
+ }
50
+ },
51
+ "versions": {
52
+ "hellaswag": 1.0
53
+ },
54
+ "n-shot": {
55
+ "hellaswag": 10
56
+ },
57
+ "higher_is_better": {
58
+ "hellaswag": {
59
+ "acc": true,
60
+ "acc_norm": true
61
+ }
62
+ },
63
+ "n-samples": {
64
+ "hellaswag": {
65
+ "original": 10042,
66
+ "effective": 10042
67
+ }
68
+ },
69
+ "config": {
70
+ "model": "hf",
71
+ "model_args": "pretrained=/home/migel/Tess-v2.5-qwen2-72B-safetensors,parallelize=True",
72
+ "model_num_parameters": 72706203648,
73
+ "model_dtype": "torch.float16",
74
+ "model_revision": "main",
75
+ "model_sha": "",
76
+ "batch_size": "8",
77
+ "batch_sizes": [],
78
+ "device": null,
79
+ "use_cache": null,
80
+ "limit": null,
81
+ "bootstrap_iters": 100000,
82
+ "gen_kwargs": null,
83
+ "random_seed": 0,
84
+ "numpy_seed": 1234,
85
+ "torch_seed": 1234,
86
+ "fewshot_seed": 1234
87
+ },
88
+ "git_hash": "b3e4c49a",
89
+ "date": 1718190545.705119,
90
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.31\n\nPython version: 3.10.14 (main, Apr 6 2024, 18:45:05) [GCC 9.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1050-azure-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\nGPU 2: NVIDIA A100 80GB PCIe\nGPU 3: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 530.30.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 48 bits physical, 48 bits virtual\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nNUMA node(s): 4\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7V13 64-Core Processor\nStepping: 1\nCPU MHz: 2445.435\nBogoMIPS: 4890.87\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB\nL1i cache: 3 MiB\nL2 cache: 48 MiB\nL3 cache: 384 MiB\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] magma-cuda117 2.6.1 1 pytorch\n[conda] mkl 2022.2.1 pypi_0 pypi\n[conda] mkl-include 2022.2.1 pypi_0 pypi\n[conda] numpy 1.24.4 pypi_0 pypi\n[conda] pytorch-lightning 1.9.5 pypi_0 pypi\n[conda] torch 2.0.1 pypi_0 pypi\n[conda] torch-nebula 0.16.10 pypi_0 pypi\n[conda] torch-ort 1.17.0 pypi_0 pypi\n[conda] torchaudio 2.0.2+cu117 pypi_0 pypi\n[conda] torchdata 0.6.1 pypi_0 pypi\n[conda] torchmetrics 1.2.0 pypi_0 pypi\n[conda] torchsnapshot 0.1.0 pypi_0 pypi\n[conda] torchvision 0.15.2+cu117 pypi_0 pypi\n[conda] triton 2.0.0 pypi_0 pypi",
91
+ "transformers_version": "4.41.1",
92
+ "upper_git_hash": null,
93
+ "task_hashes": {},
94
+ "model_source": "hf",
95
+ "model_name": "/home/migel/Tess-v2.5-qwen2-72B-safetensors",
96
+ "model_name_sanitized": "__home__migel__Tess-v2.5-qwen2-72B-safetensors",
97
+ "system_instruction": null,
98
+ "system_instruction_sha": null,
99
+ "chat_template": null,
100
+ "chat_template_sha": null,
101
+ "start_time": 404120.678699121,
102
+ "end_time": 430406.206534399,
103
+ "total_evaluation_time_seconds": "26285.527835278015"
104
+ }
Evals/Tess-v2.5-Qwen2-72B-mmlu.json ADDED
@@ -0,0 +1,3158 @@
1
+ {
2
+ "results": {
3
+ "mmlu": {
4
+ "acc,none": 0.8439680957128615,
5
+ "acc_stderr,none": 0.0029499711040394372,
6
+ "alias": "mmlu"
7
+ },
8
+ "mmlu_humanities": {
9
+ "alias": " - humanities",
10
+ "acc,none": 0.8146652497343252,
11
+ "acc_stderr,none": 0.005505402478774841
12
+ },
13
+ "mmlu_formal_logic": {
14
+ "alias": " - formal_logic",
15
+ "acc,none": 0.7301587301587301,
16
+ "acc_stderr,none": 0.03970158273235173
17
+ },
18
+ "mmlu_high_school_european_history": {
19
+ "alias": " - high_school_european_history",
20
+ "acc,none": 0.8909090909090909,
21
+ "acc_stderr,none": 0.02434383813514564
22
+ },
23
+ "mmlu_high_school_us_history": {
24
+ "alias": " - high_school_us_history",
25
+ "acc,none": 0.9509803921568627,
26
+ "acc_stderr,none": 0.01515383934021267
27
+ },
28
+ "mmlu_high_school_world_history": {
29
+ "alias": " - high_school_world_history",
30
+ "acc,none": 0.9409282700421941,
31
+ "acc_stderr,none": 0.01534659746388869
32
+ },
33
+ "mmlu_international_law": {
34
+ "alias": " - international_law",
35
+ "acc,none": 0.9173553719008265,
36
+ "acc_stderr,none": 0.025135382356604227
37
+ },
38
+ "mmlu_jurisprudence": {
39
+ "alias": " - jurisprudence",
40
+ "acc,none": 0.8796296296296297,
41
+ "acc_stderr,none": 0.031457038543062525
42
+ },
43
+ "mmlu_logical_fallacies": {
44
+ "alias": " - logical_fallacies",
45
+ "acc,none": 0.901840490797546,
46
+ "acc_stderr,none": 0.023376180231059605
47
+ },
48
+ "mmlu_moral_disputes": {
49
+ "alias": " - moral_disputes",
50
+ "acc,none": 0.869942196531792,
51
+ "acc_stderr,none": 0.01810939152822133
52
+ },
53
+ "mmlu_moral_scenarios": {
54
+ "alias": " - moral_scenarios",
55
+ "acc,none": 0.829050279329609,
56
+ "acc_stderr,none": 0.012590873868789222
57
+ },
58
+ "mmlu_philosophy": {
59
+ "alias": " - philosophy",
60
+ "acc,none": 0.8713826366559485,
61
+ "acc_stderr,none": 0.01901399630412152
62
+ },
63
+ "mmlu_prehistory": {
64
+ "alias": " - prehistory",
65
+ "acc,none": 0.9104938271604939,
66
+ "acc_stderr,none": 0.015884141073937555
67
+ },
68
+ "mmlu_professional_law": {
69
+ "alias": " - professional_law",
70
+ "acc,none": 0.6929595827900913,
71
+ "acc_stderr,none": 0.011780959114513764
72
+ },
73
+ "mmlu_world_religions": {
74
+ "alias": " - world_religions",
75
+ "acc,none": 0.8888888888888888,
76
+ "acc_stderr,none": 0.024103384202072864
77
+ },
78
+ "mmlu_other": {
79
+ "alias": " - other",
80
+ "acc,none": 0.8625683939491471,
81
+ "acc_stderr,none": 0.005895325056685939
82
+ },
83
+ "mmlu_business_ethics": {
84
+ "alias": " - business_ethics",
85
+ "acc,none": 0.78,
86
+ "acc_stderr,none": 0.04163331998932263
87
+ },
88
+ "mmlu_clinical_knowledge": {
89
+ "alias": " - clinical_knowledge",
90
+ "acc,none": 0.8716981132075472,
91
+ "acc_stderr,none": 0.02058247568799185
92
+ },
93
+ "mmlu_college_medicine": {
94
+ "alias": " - college_medicine",
95
+ "acc,none": 0.8323699421965318,
96
+ "acc_stderr,none": 0.028481963032143395
97
+ },
98
+ "mmlu_global_facts": {
99
+ "alias": " - global_facts",
100
+ "acc,none": 0.61,
101
+ "acc_stderr,none": 0.04902071300001975
102
+ },
103
+ "mmlu_human_aging": {
104
+ "alias": " - human_aging",
105
+ "acc,none": 0.8565022421524664,
106
+ "acc_stderr,none": 0.0235293712696182
107
+ },
108
+ "mmlu_management": {
109
+ "alias": " - management",
110
+ "acc,none": 0.9223300970873787,
111
+ "acc_stderr,none": 0.026501440784762766
112
+ },
113
+ "mmlu_marketing": {
114
+ "alias": " - marketing",
115
+ "acc,none": 0.9487179487179487,
116
+ "acc_stderr,none": 0.014450181176872726
117
+ },
118
+ "mmlu_medical_genetics": {
119
+ "alias": " - medical_genetics",
120
+ "acc,none": 0.9,
121
+ "acc_stderr,none": 0.030151134457776348
122
+ },
123
+ "mmlu_miscellaneous": {
124
+ "alias": " - miscellaneous",
125
+ "acc,none": 0.9501915708812261,
126
+ "acc_stderr,none": 0.0077795348866793465
127
+ },
128
+ "mmlu_nutrition": {
129
+ "alias": " - nutrition",
130
+ "acc,none": 0.9019607843137255,
131
+ "acc_stderr,none": 0.017027222935582193
132
+ },
133
+ "mmlu_professional_accounting": {
134
+ "alias": " - professional_accounting",
135
+ "acc,none": 0.75177304964539,
136
+ "acc_stderr,none": 0.025770015644290392
137
+ },
138
+ "mmlu_professional_medicine": {
139
+ "alias": " - professional_medicine",
140
+ "acc,none": 0.8897058823529411,
141
+ "acc_stderr,none": 0.019028947191474497
142
+ },
143
+ "mmlu_virology": {
144
+ "alias": " - virology",
145
+ "acc,none": 0.5662650602409639,
146
+ "acc_stderr,none": 0.03858158940685517
147
+ },
148
+ "mmlu_social_sciences": {
149
+ "alias": " - social_sciences",
150
+ "acc,none": 0.9038024049398765,
151
+ "acc_stderr,none": 0.005221504585802578
152
+ },
153
+ "mmlu_econometrics": {
154
+ "alias": " - econometrics",
155
+ "acc,none": 0.7280701754385965,
156
+ "acc_stderr,none": 0.041857744240220554
157
+ },
158
+ "mmlu_high_school_geography": {
159
+ "alias": " - high_school_geography",
160
+ "acc,none": 0.9393939393939394,
161
+ "acc_stderr,none": 0.016999994927421606
162
+ },
163
+ "mmlu_high_school_government_and_politics": {
164
+ "alias": " - high_school_government_and_politics",
165
+ "acc,none": 0.9896373056994818,
166
+ "acc_stderr,none": 0.007308424386792201
167
+ },
168
+ "mmlu_high_school_macroeconomics": {
169
+ "alias": " - high_school_macroeconomics",
170
+ "acc,none": 0.8897435897435897,
171
+ "acc_stderr,none": 0.015880331261056115
172
+ },
173
+ "mmlu_high_school_microeconomics": {
174
+ "alias": " - high_school_microeconomics",
175
+ "acc,none": 0.9411764705882353,
176
+ "acc_stderr,none": 0.015283995352038402
177
+ },
178
+ "mmlu_high_school_psychology": {
179
+ "alias": " - high_school_psychology",
180
+ "acc,none": 0.9357798165137615,
181
+ "acc_stderr,none": 0.010510494713201424
182
+ },
183
+ "mmlu_human_sexuality": {
184
+ "alias": " - human_sexuality",
185
+ "acc,none": 0.9083969465648855,
186
+ "acc_stderr,none": 0.025300035578642965
187
+ },
188
+ "mmlu_professional_psychology": {
189
+ "alias": " - professional_psychology",
190
+ "acc,none": 0.8970588235294118,
191
+ "acc_stderr,none": 0.012293751200845176
192
+ },
193
+ "mmlu_public_relations": {
194
+ "alias": " - public_relations",
195
+ "acc,none": 0.7454545454545455,
196
+ "acc_stderr,none": 0.041723430387053825
197
+ },
198
+ "mmlu_security_studies": {
199
+ "alias": " - security_studies",
200
+ "acc,none": 0.8408163265306122,
201
+ "acc_stderr,none": 0.023420972069166365
202
+ },
203
+ "mmlu_sociology": {
204
+ "alias": " - sociology",
205
+ "acc,none": 0.945273631840796,
206
+ "acc_stderr,none": 0.016082815796263254
207
+ },
208
+ "mmlu_us_foreign_policy": {
209
+ "alias": " - us_foreign_policy",
210
+ "acc,none": 0.94,
211
+ "acc_stderr,none": 0.02386832565759419
212
+ },
213
+ "mmlu_stem": {
214
+ "alias": " - stem",
215
+ "acc,none": 0.8109736758642563,
216
+ "acc_stderr,none": 0.0067376135296805745
217
+ },
218
+ "mmlu_abstract_algebra": {
219
+ "alias": " - abstract_algebra",
220
+ "acc,none": 0.66,
221
+ "acc_stderr,none": 0.04760952285695237
222
+ },
223
+ "mmlu_anatomy": {
224
+ "alias": " - anatomy",
225
+ "acc,none": 0.7925925925925926,
226
+ "acc_stderr,none": 0.03502553170678317
227
+ },
228
+ "mmlu_astronomy": {
229
+ "alias": " - astronomy",
230
+ "acc,none": 0.9276315789473685,
231
+ "acc_stderr,none": 0.021085011261884112
232
+ },
233
+ "mmlu_college_biology": {
234
+ "alias": " - college_biology",
235
+ "acc,none": 0.9444444444444444,
236
+ "acc_stderr,none": 0.01915507853243362
237
+ },
238
+ "mmlu_college_chemistry": {
239
+ "alias": " - college_chemistry",
240
+ "acc,none": 0.58,
241
+ "acc_stderr,none": 0.049604496374885836
242
+ },
243
+ "mmlu_college_computer_science": {
244
+ "alias": " - college_computer_science",
245
+ "acc,none": 0.8,
246
+ "acc_stderr,none": 0.040201512610368445
247
+ },
248
+ "mmlu_college_mathematics": {
249
+ "alias": " - college_mathematics",
250
+ "acc,none": 0.63,
251
+ "acc_stderr,none": 0.04852365870939099
252
+ },
253
+ "mmlu_college_physics": {
254
+ "alias": " - college_physics",
255
+ "acc,none": 0.6470588235294118,
256
+ "acc_stderr,none": 0.04755129616062947
257
+ },
258
+ "mmlu_computer_security": {
259
+ "alias": " - computer_security",
260
+ "acc,none": 0.83,
261
+ "acc_stderr,none": 0.0377525168068637
262
+ },
263
+ "mmlu_conceptual_physics": {
264
+ "alias": " - conceptual_physics",
265
+ "acc,none": 0.8893617021276595,
266
+ "acc_stderr,none": 0.020506145099008433
267
+ },
268
+ "mmlu_electrical_engineering": {
269
+ "alias": " - electrical_engineering",
270
+ "acc,none": 0.8275862068965517,
271
+ "acc_stderr,none": 0.03147830790259575
272
+ },
273
+ "mmlu_elementary_mathematics": {
274
+ "alias": " - elementary_mathematics",
275
+ "acc,none": 0.8888888888888888,
276
+ "acc_stderr,none": 0.01618571201620511
277
+ },
278
+ "mmlu_high_school_biology": {
279
+ "alias": " - high_school_biology",
280
+ "acc,none": 0.9419354838709677,
281
+ "acc_stderr,none": 0.01330413811280927
282
+ },
283
+ "mmlu_high_school_chemistry": {
284
+ "alias": " - high_school_chemistry",
285
+ "acc,none": 0.7980295566502463,
286
+ "acc_stderr,none": 0.028247350122180243
287
+ },
288
+ "mmlu_high_school_computer_science": {
289
+ "alias": " - high_school_computer_science",
290
+ "acc,none": 0.91,
291
+ "acc_stderr,none": 0.028762349126466115
292
+ },
293
+ "mmlu_high_school_mathematics": {
294
+ "alias": " - high_school_mathematics",
295
+ "acc,none": 0.6777777777777778,
296
+ "acc_stderr,none": 0.028493465091028597
297
+ },
298
+ "mmlu_high_school_physics": {
299
+ "alias": " - high_school_physics",
300
+ "acc,none": 0.7284768211920529,
301
+ "acc_stderr,none": 0.03631329803969654
302
+ },
303
+ "mmlu_high_school_statistics": {
304
+ "alias": " - high_school_statistics",
305
+ "acc,none": 0.7824074074074074,
306
+ "acc_stderr,none": 0.028139689444859676
307
+ },
308
+ "mmlu_machine_learning": {
309
+ "alias": " - machine_learning",
310
+ "acc,none": 0.7589285714285714,
311
+ "acc_stderr,none": 0.04059867246952685
312
+ }
313
+ },
314
+ "groups": {
315
+ "mmlu": {
316
+ "acc,none": 0.8439680957128615,
317
+ "acc_stderr,none": 0.0029499711040394372,
318
+ "alias": "mmlu"
319
+ },
320
+ "mmlu_humanities": {
321
+ "alias": " - humanities",
322
+ "acc,none": 0.8146652497343252,
323
+ "acc_stderr,none": 0.005505402478774841
324
+ },
325
+ "mmlu_other": {
326
+ "alias": " - other",
327
+ "acc,none": 0.8625683939491471,
328
+ "acc_stderr,none": 0.005895325056685939
329
+ },
330
+ "mmlu_social_sciences": {
331
+ "alias": " - social_sciences",
332
+ "acc,none": 0.9038024049398765,
333
+ "acc_stderr,none": 0.005221504585802578
334
+ },
335
+ "mmlu_stem": {
336
+ "alias": " - stem",
337
+ "acc,none": 0.8109736758642563,
338
+ "acc_stderr,none": 0.0067376135296805745
339
+ }
340
+ },
341
+ "group_subtasks": {
342
+ "mmlu_stem": [
343
+ "mmlu_college_biology",
344
+ "mmlu_high_school_computer_science",
345
+ "mmlu_elementary_mathematics",
346
+ "mmlu_astronomy",
347
+ "mmlu_machine_learning",
348
+ "mmlu_high_school_mathematics",
349
+ "mmlu_electrical_engineering",
350
+ "mmlu_college_chemistry",
351
+ "mmlu_college_mathematics",
352
+ "mmlu_high_school_statistics",
353
+ "mmlu_high_school_biology",
354
+ "mmlu_abstract_algebra",
355
+ "mmlu_college_physics",
356
+ "mmlu_conceptual_physics",
357
+ "mmlu_computer_security",
358
+ "mmlu_anatomy",
359
+ "mmlu_college_computer_science",
360
+ "mmlu_high_school_physics",
361
+ "mmlu_high_school_chemistry"
362
+ ],
363
+ "mmlu_other": [
364
+ "mmlu_marketing",
365
+ "mmlu_professional_accounting",
366
+ "mmlu_clinical_knowledge",
367
+ "mmlu_college_medicine",
368
+ "mmlu_miscellaneous",
369
+ "mmlu_virology",
370
+ "mmlu_business_ethics",
371
+ "mmlu_professional_medicine",
372
+ "mmlu_global_facts",
373
+ "mmlu_nutrition",
374
+ "mmlu_human_aging",
375
+ "mmlu_management",
376
+ "mmlu_medical_genetics"
377
+ ],
378
+ "mmlu_social_sciences": [
379
+ "mmlu_high_school_psychology",
380
+ "mmlu_high_school_geography",
381
+ "mmlu_high_school_macroeconomics",
382
+ "mmlu_public_relations",
383
+ "mmlu_security_studies",
384
+ "mmlu_high_school_microeconomics",
385
+ "mmlu_human_sexuality",
386
+ "mmlu_sociology",
387
+ "mmlu_professional_psychology",
388
+ "mmlu_econometrics",
389
+ "mmlu_us_foreign_policy",
390
+ "mmlu_high_school_government_and_politics"
391
+ ],
392
+ "mmlu_humanities": [
393
+ "mmlu_moral_scenarios",
394
+ "mmlu_high_school_us_history",
395
+ "mmlu_high_school_world_history",
396
+ "mmlu_world_religions",
397
+ "mmlu_formal_logic",
398
+ "mmlu_moral_disputes",
399
+ "mmlu_prehistory",
400
+ "mmlu_international_law",
401
+ "mmlu_logical_fallacies",
402
+ "mmlu_professional_law",
403
+ "mmlu_philosophy",
404
+ "mmlu_high_school_european_history",
405
+ "mmlu_jurisprudence"
406
+ ],
407
+ "mmlu": [
408
+ "mmlu_humanities",
409
+ "mmlu_social_sciences",
410
+ "mmlu_other",
411
+ "mmlu_stem"
412
+ ]
413
+ },
414
+ "configs": {
415
+ "mmlu_abstract_algebra": {
416
+ "task": "mmlu_abstract_algebra",
417
+ "task_alias": "abstract_algebra",
418
+ "group": "mmlu_stem",
419
+ "group_alias": "stem",
420
+ "dataset_path": "hails/mmlu_no_train",
421
+ "dataset_name": "abstract_algebra",
422
+ "test_split": "test",
423
+ "fewshot_split": "dev",
424
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
425
+ "doc_to_target": "answer",
426
+ "doc_to_choice": [
427
+ "A",
428
+ "B",
429
+ "C",
430
+ "D"
431
+ ],
432
+ "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
433
+ "target_delimiter": " ",
434
+ "fewshot_delimiter": "\n\n",
435
+ "fewshot_config": {
436
+ "sampler": "first_n"
437
+ },
438
+ "num_fewshot": 5,
439
+ "metric_list": [
440
+ {
441
+ "metric": "acc",
442
+ "aggregation": "mean",
443
+ "higher_is_better": true
444
+ }
445
+ ],
446
+ "output_type": "multiple_choice",
447
+ "repeats": 1,
448
+ "should_decontaminate": false,
449
+ "metadata": {
450
+ "version": 0.0
451
+ }
452
+ },
453
+ "mmlu_anatomy": {
454
+ "task": "mmlu_anatomy",
455
+ "task_alias": "anatomy",
456
+ "group": "mmlu_stem",
457
+ "group_alias": "stem",
458
+ "dataset_path": "hails/mmlu_no_train",
459
+ "dataset_name": "anatomy",
460
+ "test_split": "test",
461
+ "fewshot_split": "dev",
462
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
463
+ "doc_to_target": "answer",
464
+ "doc_to_choice": [
465
+ "A",
466
+ "B",
467
+ "C",
468
+ "D"
469
+ ],
470
+ "description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
471
+ "target_delimiter": " ",
472
+ "fewshot_delimiter": "\n\n",
473
+ "fewshot_config": {
474
+ "sampler": "first_n"
475
+ },
476
+ "num_fewshot": 5,
477
+ "metric_list": [
478
+ {
479
+ "metric": "acc",
480
+ "aggregation": "mean",
481
+ "higher_is_better": true
482
+ }
483
+ ],
484
+ "output_type": "multiple_choice",
485
+ "repeats": 1,
486
+ "should_decontaminate": false,
487
+ "metadata": {
488
+ "version": 0.0
489
+ }
490
+ },
491
+ "mmlu_astronomy": {
492
+ "task": "mmlu_astronomy",
493
+ "task_alias": "astronomy",
494
+ "group": "mmlu_stem",
495
+ "group_alias": "stem",
496
+ "dataset_path": "hails/mmlu_no_train",
497
+ "dataset_name": "astronomy",
498
+ "test_split": "test",
499
+ "fewshot_split": "dev",
500
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
501
+ "doc_to_target": "answer",
502
+ "doc_to_choice": [
503
+ "A",
504
+ "B",
505
+ "C",
506
+ "D"
507
+ ],
508
+ "description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
509
+ "target_delimiter": " ",
510
+ "fewshot_delimiter": "\n\n",
511
+ "fewshot_config": {
512
+ "sampler": "first_n"
513
+ },
514
+ "num_fewshot": 5,
515
+ "metric_list": [
516
+ {
517
+ "metric": "acc",
518
+ "aggregation": "mean",
519
+ "higher_is_better": true
520
+ }
521
+ ],
522
+ "output_type": "multiple_choice",
523
+ "repeats": 1,
524
+ "should_decontaminate": false,
525
+ "metadata": {
526
+ "version": 0.0
527
+ }
528
+ },
529
+ "mmlu_business_ethics": {
530
+ "task": "mmlu_business_ethics",
531
+ "task_alias": "business_ethics",
532
+ "group": "mmlu_other",
533
+ "group_alias": "other",
534
+ "dataset_path": "hails/mmlu_no_train",
535
+ "dataset_name": "business_ethics",
536
+ "test_split": "test",
537
+ "fewshot_split": "dev",
538
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
539
+ "doc_to_target": "answer",
540
+ "doc_to_choice": [
541
+ "A",
542
+ "B",
543
+ "C",
544
+ "D"
545
+ ],
546
+ "description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
547
+ "target_delimiter": " ",
548
+ "fewshot_delimiter": "\n\n",
549
+ "fewshot_config": {
550
+ "sampler": "first_n"
551
+ },
552
+ "num_fewshot": 5,
553
+ "metric_list": [
554
+ {
555
+ "metric": "acc",
556
+ "aggregation": "mean",
557
+ "higher_is_better": true
558
+ }
559
+ ],
560
+ "output_type": "multiple_choice",
561
+ "repeats": 1,
562
+ "should_decontaminate": false,
563
+ "metadata": {
564
+ "version": 0.0
565
+ }
566
+ },
567
+ "mmlu_clinical_knowledge": {
568
+ "task": "mmlu_clinical_knowledge",
569
+ "task_alias": "clinical_knowledge",
570
+ "group": "mmlu_other",
571
+ "group_alias": "other",
572
+ "dataset_path": "hails/mmlu_no_train",
573
+ "dataset_name": "clinical_knowledge",
574
+ "test_split": "test",
575
+ "fewshot_split": "dev",
576
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
577
+ "doc_to_target": "answer",
578
+ "doc_to_choice": [
579
+ "A",
580
+ "B",
581
+ "C",
582
+ "D"
583
+ ],
584
+ "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
585
+ "target_delimiter": " ",
586
+ "fewshot_delimiter": "\n\n",
587
+ "fewshot_config": {
588
+ "sampler": "first_n"
589
+ },
590
+ "num_fewshot": 5,
591
+ "metric_list": [
592
+ {
593
+ "metric": "acc",
594
+ "aggregation": "mean",
595
+ "higher_is_better": true
596
+ }
597
+ ],
598
+ "output_type": "multiple_choice",
599
+ "repeats": 1,
600
+ "should_decontaminate": false,
601
+ "metadata": {
602
+ "version": 0.0
603
+ }
604
+ },
605
+ "mmlu_college_biology": {
606
+ "task": "mmlu_college_biology",
607
+ "task_alias": "college_biology",
608
+ "group": "mmlu_stem",
609
+ "group_alias": "stem",
610
+ "dataset_path": "hails/mmlu_no_train",
611
+ "dataset_name": "college_biology",
612
+ "test_split": "test",
613
+ "fewshot_split": "dev",
614
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
615
+ "doc_to_target": "answer",
616
+ "doc_to_choice": [
617
+ "A",
618
+ "B",
619
+ "C",
620
+ "D"
621
+ ],
622
+ "description": "The following are multiple choice questions (with answers) about college biology.\n\n",
623
+ "target_delimiter": " ",
624
+ "fewshot_delimiter": "\n\n",
625
+ "fewshot_config": {
626
+ "sampler": "first_n"
627
+ },
628
+ "num_fewshot": 5,
629
+ "metric_list": [
630
+ {
631
+ "metric": "acc",
632
+ "aggregation": "mean",
633
+ "higher_is_better": true
634
+ }
635
+ ],
636
+ "output_type": "multiple_choice",
637
+ "repeats": 1,
638
+ "should_decontaminate": false,
639
+ "metadata": {
640
+ "version": 0.0
641
+ }
642
+ },
643
+ "mmlu_college_chemistry": {
644
+ "task": "mmlu_college_chemistry",
645
+ "task_alias": "college_chemistry",
646
+ "group": "mmlu_stem",
647
+ "group_alias": "stem",
648
+ "dataset_path": "hails/mmlu_no_train",
649
+ "dataset_name": "college_chemistry",
650
+ "test_split": "test",
651
+ "fewshot_split": "dev",
652
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
653
+ "doc_to_target": "answer",
654
+ "doc_to_choice": [
655
+ "A",
656
+ "B",
657
+ "C",
658
+ "D"
659
+ ],
660
+ "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
661
+ "target_delimiter": " ",
662
+ "fewshot_delimiter": "\n\n",
663
+ "fewshot_config": {
664
+ "sampler": "first_n"
665
+ },
666
+ "num_fewshot": 5,
667
+ "metric_list": [
668
+ {
669
+ "metric": "acc",
670
+ "aggregation": "mean",
671
+ "higher_is_better": true
672
+ }
673
+ ],
674
+ "output_type": "multiple_choice",
675
+ "repeats": 1,
676
+ "should_decontaminate": false,
677
+ "metadata": {
678
+ "version": 0.0
679
+ }
680
+ },
681
+ "mmlu_college_computer_science": {
682
+ "task": "mmlu_college_computer_science",
683
+ "task_alias": "college_computer_science",
684
+ "group": "mmlu_stem",
685
+ "group_alias": "stem",
686
+ "dataset_path": "hails/mmlu_no_train",
687
+ "dataset_name": "college_computer_science",
688
+ "test_split": "test",
689
+ "fewshot_split": "dev",
690
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
691
+ "doc_to_target": "answer",
692
+ "doc_to_choice": [
693
+ "A",
694
+ "B",
695
+ "C",
696
+ "D"
697
+ ],
698
+ "description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
699
+ "target_delimiter": " ",
700
+ "fewshot_delimiter": "\n\n",
701
+ "fewshot_config": {
702
+ "sampler": "first_n"
703
+ },
704
+ "num_fewshot": 5,
705
+ "metric_list": [
706
+ {
707
+ "metric": "acc",
708
+ "aggregation": "mean",
709
+ "higher_is_better": true
710
+ }
711
+ ],
712
+ "output_type": "multiple_choice",
713
+ "repeats": 1,
714
+ "should_decontaminate": false,
715
+ "metadata": {
716
+ "version": 0.0
717
+ }
718
+ },
719
+ "mmlu_college_mathematics": {
720
+ "task": "mmlu_college_mathematics",
721
+ "task_alias": "college_mathematics",
722
+ "group": "mmlu_stem",
723
+ "group_alias": "stem",
724
+ "dataset_path": "hails/mmlu_no_train",
725
+ "dataset_name": "college_mathematics",
726
+ "test_split": "test",
727
+ "fewshot_split": "dev",
728
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
729
+ "doc_to_target": "answer",
730
+ "doc_to_choice": [
731
+ "A",
732
+ "B",
733
+ "C",
734
+ "D"
735
+ ],
736
+ "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
737
+ "target_delimiter": " ",
738
+ "fewshot_delimiter": "\n\n",
739
+ "fewshot_config": {
740
+ "sampler": "first_n"
741
+ },
742
+ "num_fewshot": 5,
743
+ "metric_list": [
744
+ {
745
+ "metric": "acc",
746
+ "aggregation": "mean",
747
+ "higher_is_better": true
748
+ }
749
+ ],
750
+ "output_type": "multiple_choice",
751
+ "repeats": 1,
752
+ "should_decontaminate": false,
753
+ "metadata": {
754
+ "version": 0.0
755
+ }
756
+ },
757
+ "mmlu_college_medicine": {
758
+ "task": "mmlu_college_medicine",
759
+ "task_alias": "college_medicine",
760
+ "group": "mmlu_other",
761
+ "group_alias": "other",
762
+ "dataset_path": "hails/mmlu_no_train",
763
+ "dataset_name": "college_medicine",
764
+ "test_split": "test",
765
+ "fewshot_split": "dev",
766
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
767
+ "doc_to_target": "answer",
768
+ "doc_to_choice": [
769
+ "A",
770
+ "B",
771
+ "C",
772
+ "D"
773
+ ],
774
+ "description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
775
+ "target_delimiter": " ",
776
+ "fewshot_delimiter": "\n\n",
777
+ "fewshot_config": {
778
+ "sampler": "first_n"
779
+ },
780
+ "num_fewshot": 5,
781
+ "metric_list": [
782
+ {
783
+ "metric": "acc",
784
+ "aggregation": "mean",
785
+ "higher_is_better": true
786
+ }
787
+ ],
788
+ "output_type": "multiple_choice",
789
+ "repeats": 1,
790
+ "should_decontaminate": false,
791
+ "metadata": {
792
+ "version": 0.0
793
+ }
794
+ },
795
+ "mmlu_college_physics": {
796
+ "task": "mmlu_college_physics",
797
+ "task_alias": "college_physics",
798
+ "group": "mmlu_stem",
799
+ "group_alias": "stem",
800
+ "dataset_path": "hails/mmlu_no_train",
801
+ "dataset_name": "college_physics",
802
+ "test_split": "test",
803
+ "fewshot_split": "dev",
804
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
805
+ "doc_to_target": "answer",
806
+ "doc_to_choice": [
807
+ "A",
808
+ "B",
809
+ "C",
810
+ "D"
811
+ ],
812
+ "description": "The following are multiple choice questions (with answers) about college physics.\n\n",
813
+ "target_delimiter": " ",
814
+ "fewshot_delimiter": "\n\n",
815
+ "fewshot_config": {
816
+ "sampler": "first_n"
817
+ },
818
+ "num_fewshot": 5,
819
+ "metric_list": [
820
+ {
821
+ "metric": "acc",
822
+ "aggregation": "mean",
823
+ "higher_is_better": true
824
+ }
825
+ ],
826
+ "output_type": "multiple_choice",
827
+ "repeats": 1,
828
+ "should_decontaminate": false,
829
+ "metadata": {
830
+ "version": 0.0
831
+ }
832
+ },
833
+ "mmlu_computer_security": {
834
+ "task": "mmlu_computer_security",
835
+ "task_alias": "computer_security",
836
+ "group": "mmlu_stem",
837
+ "group_alias": "stem",
838
+ "dataset_path": "hails/mmlu_no_train",
839
+ "dataset_name": "computer_security",
840
+ "test_split": "test",
841
+ "fewshot_split": "dev",
842
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
843
+ "doc_to_target": "answer",
844
+ "doc_to_choice": [
845
+ "A",
846
+ "B",
847
+ "C",
848
+ "D"
849
+ ],
850
+ "description": "The following are multiple choice questions (with answers) about computer security.\n\n",
851
+ "target_delimiter": " ",
852
+ "fewshot_delimiter": "\n\n",
853
+ "fewshot_config": {
854
+ "sampler": "first_n"
855
+ },
856
+ "num_fewshot": 5,
857
+ "metric_list": [
858
+ {
859
+ "metric": "acc",
860
+ "aggregation": "mean",
861
+ "higher_is_better": true
862
+ }
863
+ ],
864
+ "output_type": "multiple_choice",
865
+ "repeats": 1,
866
+ "should_decontaminate": false,
867
+ "metadata": {
868
+ "version": 0.0
869
+ }
870
+ },
871
+ "mmlu_conceptual_physics": {
872
+ "task": "mmlu_conceptual_physics",
873
+ "task_alias": "conceptual_physics",
874
+ "group": "mmlu_stem",
875
+ "group_alias": "stem",
876
+ "dataset_path": "hails/mmlu_no_train",
877
+ "dataset_name": "conceptual_physics",
878
+ "test_split": "test",
879
+ "fewshot_split": "dev",
880
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
881
+ "doc_to_target": "answer",
882
+ "doc_to_choice": [
883
+ "A",
884
+ "B",
885
+ "C",
886
+ "D"
887
+ ],
888
+ "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
889
+ "target_delimiter": " ",
890
+ "fewshot_delimiter": "\n\n",
891
+ "fewshot_config": {
892
+ "sampler": "first_n"
893
+ },
894
+ "num_fewshot": 5,
895
+ "metric_list": [
896
+ {
897
+ "metric": "acc",
898
+ "aggregation": "mean",
899
+ "higher_is_better": true
900
+ }
901
+ ],
902
+ "output_type": "multiple_choice",
903
+ "repeats": 1,
904
+ "should_decontaminate": false,
905
+ "metadata": {
906
+ "version": 0.0
907
+ }
908
+ },
909
+ "mmlu_econometrics": {
910
+ "task": "mmlu_econometrics",
911
+ "task_alias": "econometrics",
912
+ "group": "mmlu_social_sciences",
913
+ "group_alias": "social_sciences",
914
+ "dataset_path": "hails/mmlu_no_train",
915
+ "dataset_name": "econometrics",
916
+ "test_split": "test",
917
+ "fewshot_split": "dev",
918
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
919
+ "doc_to_target": "answer",
920
+ "doc_to_choice": [
921
+ "A",
922
+ "B",
923
+ "C",
924
+ "D"
925
+ ],
926
+ "description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
927
+ "target_delimiter": " ",
928
+ "fewshot_delimiter": "\n\n",
929
+ "fewshot_config": {
930
+ "sampler": "first_n"
931
+ },
932
+ "num_fewshot": 5,
933
+ "metric_list": [
934
+ {
935
+ "metric": "acc",
936
+ "aggregation": "mean",
937
+ "higher_is_better": true
938
+ }
939
+ ],
940
+ "output_type": "multiple_choice",
941
+ "repeats": 1,
942
+ "should_decontaminate": false,
943
+ "metadata": {
944
+ "version": 0.0
945
+ }
946
+ },
947
+ "mmlu_electrical_engineering": {
948
+ "task": "mmlu_electrical_engineering",
949
+ "task_alias": "electrical_engineering",
950
+ "group": "mmlu_stem",
951
+ "group_alias": "stem",
952
+ "dataset_path": "hails/mmlu_no_train",
953
+ "dataset_name": "electrical_engineering",
954
+ "test_split": "test",
955
+ "fewshot_split": "dev",
956
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
957
+ "doc_to_target": "answer",
958
+ "doc_to_choice": [
959
+ "A",
960
+ "B",
961
+ "C",
962
+ "D"
963
+ ],
964
+ "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
965
+ "target_delimiter": " ",
966
+ "fewshot_delimiter": "\n\n",
967
+ "fewshot_config": {
968
+ "sampler": "first_n"
969
+ },
970
+ "num_fewshot": 5,
971
+ "metric_list": [
972
+ {
973
+ "metric": "acc",
974
+ "aggregation": "mean",
975
+ "higher_is_better": true
976
+ }
977
+ ],
978
+ "output_type": "multiple_choice",
979
+ "repeats": 1,
980
+ "should_decontaminate": false,
981
+ "metadata": {
982
+ "version": 0.0
983
+ }
984
+ },
985
+ "mmlu_elementary_mathematics": {
986
+ "task": "mmlu_elementary_mathematics",
987
+ "task_alias": "elementary_mathematics",
988
+ "group": "mmlu_stem",
989
+ "group_alias": "stem",
990
+ "dataset_path": "hails/mmlu_no_train",
991
+ "dataset_name": "elementary_mathematics",
992
+ "test_split": "test",
993
+ "fewshot_split": "dev",
994
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
995
+ "doc_to_target": "answer",
996
+ "doc_to_choice": [
997
+ "A",
998
+ "B",
999
+ "C",
1000
+ "D"
1001
+ ],
1002
+ "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
1003
+ "target_delimiter": " ",
1004
+ "fewshot_delimiter": "\n\n",
1005
+ "fewshot_config": {
1006
+ "sampler": "first_n"
1007
+ },
1008
+ "num_fewshot": 5,
1009
+ "metric_list": [
1010
+ {
1011
+ "metric": "acc",
1012
+ "aggregation": "mean",
1013
+ "higher_is_better": true
1014
+ }
1015
+ ],
1016
+ "output_type": "multiple_choice",
1017
+ "repeats": 1,
1018
+ "should_decontaminate": false,
1019
+ "metadata": {
1020
+ "version": 0.0
1021
+ }
1022
+ },
1023
+ "mmlu_formal_logic": {
1024
+ "task": "mmlu_formal_logic",
1025
+ "task_alias": "formal_logic",
1026
+ "group": "mmlu_humanities",
1027
+ "group_alias": "humanities",
1028
+ "dataset_path": "hails/mmlu_no_train",
1029
+ "dataset_name": "formal_logic",
1030
+ "test_split": "test",
1031
+ "fewshot_split": "dev",
1032
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1033
+ "doc_to_target": "answer",
1034
+ "doc_to_choice": [
1035
+ "A",
1036
+ "B",
1037
+ "C",
1038
+ "D"
1039
+ ],
1040
+ "description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
1041
+ "target_delimiter": " ",
1042
+ "fewshot_delimiter": "\n\n",
1043
+ "fewshot_config": {
1044
+ "sampler": "first_n"
1045
+ },
1046
+ "num_fewshot": 5,
1047
+ "metric_list": [
1048
+ {
1049
+ "metric": "acc",
1050
+ "aggregation": "mean",
1051
+ "higher_is_better": true
1052
+ }
1053
+ ],
1054
+ "output_type": "multiple_choice",
1055
+ "repeats": 1,
1056
+ "should_decontaminate": false,
1057
+ "metadata": {
1058
+ "version": 0.0
1059
+ }
1060
+ },
1061
+ "mmlu_global_facts": {
1062
+ "task": "mmlu_global_facts",
1063
+ "task_alias": "global_facts",
1064
+ "group": "mmlu_other",
1065
+ "group_alias": "other",
1066
+ "dataset_path": "hails/mmlu_no_train",
1067
+ "dataset_name": "global_facts",
1068
+ "test_split": "test",
1069
+ "fewshot_split": "dev",
1070
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1071
+ "doc_to_target": "answer",
1072
+ "doc_to_choice": [
1073
+ "A",
1074
+ "B",
1075
+ "C",
1076
+ "D"
1077
+ ],
1078
+ "description": "The following are multiple choice questions (with answers) about global facts.\n\n",
1079
+ "target_delimiter": " ",
1080
+ "fewshot_delimiter": "\n\n",
1081
+ "fewshot_config": {
1082
+ "sampler": "first_n"
1083
+ },
1084
+ "num_fewshot": 5,
1085
+ "metric_list": [
1086
+ {
1087
+ "metric": "acc",
1088
+ "aggregation": "mean",
1089
+ "higher_is_better": true
1090
+ }
1091
+ ],
1092
+ "output_type": "multiple_choice",
1093
+ "repeats": 1,
1094
+ "should_decontaminate": false,
1095
+ "metadata": {
1096
+ "version": 0.0
1097
+ }
1098
+ },
1099
+ "mmlu_high_school_biology": {
1100
+ "task": "mmlu_high_school_biology",
1101
+ "task_alias": "high_school_biology",
1102
+ "group": "mmlu_stem",
1103
+ "group_alias": "stem",
1104
+ "dataset_path": "hails/mmlu_no_train",
1105
+ "dataset_name": "high_school_biology",
1106
+ "test_split": "test",
1107
+ "fewshot_split": "dev",
1108
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1109
+ "doc_to_target": "answer",
1110
+ "doc_to_choice": [
1111
+ "A",
1112
+ "B",
1113
+ "C",
1114
+ "D"
1115
+ ],
1116
+ "description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
1117
+ "target_delimiter": " ",
1118
+ "fewshot_delimiter": "\n\n",
1119
+ "fewshot_config": {
1120
+ "sampler": "first_n"
1121
+ },
1122
+ "num_fewshot": 5,
1123
+ "metric_list": [
1124
+ {
1125
+ "metric": "acc",
1126
+ "aggregation": "mean",
1127
+ "higher_is_better": true
1128
+ }
1129
+ ],
1130
+ "output_type": "multiple_choice",
1131
+ "repeats": 1,
1132
+ "should_decontaminate": false,
1133
+ "metadata": {
1134
+ "version": 0.0
1135
+ }
1136
+ },
1137
+ "mmlu_high_school_chemistry": {
1138
+ "task": "mmlu_high_school_chemistry",
1139
+ "task_alias": "high_school_chemistry",
1140
+ "group": "mmlu_stem",
1141
+ "group_alias": "stem",
1142
+ "dataset_path": "hails/mmlu_no_train",
1143
+ "dataset_name": "high_school_chemistry",
1144
+ "test_split": "test",
1145
+ "fewshot_split": "dev",
1146
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1147
+ "doc_to_target": "answer",
1148
+ "doc_to_choice": [
1149
+ "A",
1150
+ "B",
1151
+ "C",
1152
+ "D"
1153
+ ],
1154
+ "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
1155
+ "target_delimiter": " ",
1156
+ "fewshot_delimiter": "\n\n",
1157
+ "fewshot_config": {
1158
+ "sampler": "first_n"
1159
+ },
1160
+ "num_fewshot": 5,
1161
+ "metric_list": [
1162
+ {
1163
+ "metric": "acc",
1164
+ "aggregation": "mean",
1165
+ "higher_is_better": true
1166
+ }
1167
+ ],
1168
+ "output_type": "multiple_choice",
1169
+ "repeats": 1,
1170
+ "should_decontaminate": false,
1171
+ "metadata": {
1172
+ "version": 0.0
1173
+ }
1174
+ },
1175
+ "mmlu_high_school_computer_science": {
1176
+ "task": "mmlu_high_school_computer_science",
1177
+ "task_alias": "high_school_computer_science",
1178
+ "group": "mmlu_stem",
1179
+ "group_alias": "stem",
1180
+ "dataset_path": "hails/mmlu_no_train",
1181
+ "dataset_name": "high_school_computer_science",
1182
+ "test_split": "test",
1183
+ "fewshot_split": "dev",
1184
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1185
+ "doc_to_target": "answer",
1186
+ "doc_to_choice": [
1187
+ "A",
1188
+ "B",
1189
+ "C",
1190
+ "D"
1191
+ ],
1192
+ "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
1193
+ "target_delimiter": " ",
1194
+ "fewshot_delimiter": "\n\n",
1195
+ "fewshot_config": {
1196
+ "sampler": "first_n"
1197
+ },
1198
+ "num_fewshot": 5,
1199
+ "metric_list": [
1200
+ {
1201
+ "metric": "acc",
1202
+ "aggregation": "mean",
1203
+ "higher_is_better": true
1204
+ }
1205
+ ],
1206
+ "output_type": "multiple_choice",
1207
+ "repeats": 1,
1208
+ "should_decontaminate": false,
1209
+ "metadata": {
1210
+ "version": 0.0
1211
+ }
1212
+ },
1213
+ "mmlu_high_school_european_history": {
1214
+ "task": "mmlu_high_school_european_history",
1215
+ "task_alias": "high_school_european_history",
1216
+ "group": "mmlu_humanities",
1217
+ "group_alias": "humanities",
1218
+ "dataset_path": "hails/mmlu_no_train",
1219
+ "dataset_name": "high_school_european_history",
1220
+ "test_split": "test",
1221
+ "fewshot_split": "dev",
1222
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1223
+ "doc_to_target": "answer",
1224
+ "doc_to_choice": [
1225
+ "A",
1226
+ "B",
1227
+ "C",
1228
+ "D"
1229
+ ],
1230
+ "description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
1231
+ "target_delimiter": " ",
1232
+ "fewshot_delimiter": "\n\n",
1233
+ "fewshot_config": {
1234
+ "sampler": "first_n"
1235
+ },
1236
+ "num_fewshot": 5,
1237
+ "metric_list": [
1238
+ {
1239
+ "metric": "acc",
1240
+ "aggregation": "mean",
1241
+ "higher_is_better": true
1242
+ }
1243
+ ],
1244
+ "output_type": "multiple_choice",
1245
+ "repeats": 1,
1246
+ "should_decontaminate": false,
1247
+ "metadata": {
1248
+ "version": 0.0
1249
+ }
1250
+ },
1251
+ "mmlu_high_school_geography": {
1252
+ "task": "mmlu_high_school_geography",
1253
+ "task_alias": "high_school_geography",
1254
+ "group": "mmlu_social_sciences",
1255
+ "group_alias": "social_sciences",
1256
+ "dataset_path": "hails/mmlu_no_train",
1257
+ "dataset_name": "high_school_geography",
1258
+ "test_split": "test",
1259
+ "fewshot_split": "dev",
1260
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1261
+ "doc_to_target": "answer",
1262
+ "doc_to_choice": [
1263
+ "A",
1264
+ "B",
1265
+ "C",
1266
+ "D"
1267
+ ],
1268
+ "description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
1269
+ "target_delimiter": " ",
1270
+ "fewshot_delimiter": "\n\n",
1271
+ "fewshot_config": {
1272
+ "sampler": "first_n"
1273
+ },
1274
+ "num_fewshot": 5,
1275
+ "metric_list": [
1276
+ {
1277
+ "metric": "acc",
1278
+ "aggregation": "mean",
1279
+ "higher_is_better": true
1280
+ }
1281
+ ],
1282
+ "output_type": "multiple_choice",
1283
+ "repeats": 1,
1284
+ "should_decontaminate": false,
1285
+ "metadata": {
1286
+ "version": 0.0
1287
+ }
1288
+ },
1289
+ "mmlu_high_school_government_and_politics": {
1290
+ "task": "mmlu_high_school_government_and_politics",
1291
+ "task_alias": "high_school_government_and_politics",
1292
+ "group": "mmlu_social_sciences",
1293
+ "group_alias": "social_sciences",
1294
+ "dataset_path": "hails/mmlu_no_train",
1295
+ "dataset_name": "high_school_government_and_politics",
1296
+ "test_split": "test",
1297
+ "fewshot_split": "dev",
1298
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1299
+ "doc_to_target": "answer",
1300
+ "doc_to_choice": [
1301
+ "A",
1302
+ "B",
1303
+ "C",
1304
+ "D"
1305
+ ],
1306
+ "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
1307
+ "target_delimiter": " ",
1308
+ "fewshot_delimiter": "\n\n",
1309
+ "fewshot_config": {
1310
+ "sampler": "first_n"
1311
+ },
1312
+ "num_fewshot": 5,
1313
+ "metric_list": [
1314
+ {
1315
+ "metric": "acc",
1316
+ "aggregation": "mean",
1317
+ "higher_is_better": true
1318
+ }
1319
+ ],
1320
+ "output_type": "multiple_choice",
1321
+ "repeats": 1,
1322
+ "should_decontaminate": false,
1323
+ "metadata": {
1324
+ "version": 0.0
1325
+ }
1326
+ },
1327
+ "mmlu_high_school_macroeconomics": {
1328
+ "task": "mmlu_high_school_macroeconomics",
1329
+ "task_alias": "high_school_macroeconomics",
1330
+ "group": "mmlu_social_sciences",
1331
+ "group_alias": "social_sciences",
1332
+ "dataset_path": "hails/mmlu_no_train",
1333
+ "dataset_name": "high_school_macroeconomics",
1334
+ "test_split": "test",
1335
+ "fewshot_split": "dev",
1336
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1337
+ "doc_to_target": "answer",
1338
+ "doc_to_choice": [
1339
+ "A",
1340
+ "B",
1341
+ "C",
1342
+ "D"
1343
+ ],
1344
+ "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
1345
+ "target_delimiter": " ",
1346
+ "fewshot_delimiter": "\n\n",
1347
+ "fewshot_config": {
1348
+ "sampler": "first_n"
1349
+ },
1350
+ "num_fewshot": 5,
1351
+ "metric_list": [
1352
+ {
1353
+ "metric": "acc",
1354
+ "aggregation": "mean",
1355
+ "higher_is_better": true
1356
+ }
1357
+ ],
1358
+ "output_type": "multiple_choice",
1359
+ "repeats": 1,
1360
+ "should_decontaminate": false,
1361
+ "metadata": {
1362
+ "version": 0.0
1363
+ }
1364
+ },
1365
+ "mmlu_high_school_mathematics": {
1366
+ "task": "mmlu_high_school_mathematics",
1367
+ "task_alias": "high_school_mathematics",
1368
+ "group": "mmlu_stem",
1369
+ "group_alias": "stem",
1370
+ "dataset_path": "hails/mmlu_no_train",
1371
+ "dataset_name": "high_school_mathematics",
1372
+ "test_split": "test",
1373
+ "fewshot_split": "dev",
1374
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1375
+ "doc_to_target": "answer",
1376
+ "doc_to_choice": [
1377
+ "A",
1378
+ "B",
1379
+ "C",
1380
+ "D"
1381
+ ],
1382
+ "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
1383
+ "target_delimiter": " ",
1384
+ "fewshot_delimiter": "\n\n",
1385
+ "fewshot_config": {
1386
+ "sampler": "first_n"
1387
+ },
1388
+ "num_fewshot": 5,
1389
+ "metric_list": [
1390
+ {
1391
+ "metric": "acc",
1392
+ "aggregation": "mean",
1393
+ "higher_is_better": true
1394
+ }
1395
+ ],
1396
+ "output_type": "multiple_choice",
1397
+ "repeats": 1,
1398
+ "should_decontaminate": false,
1399
+ "metadata": {
1400
+ "version": 0.0
1401
+ }
1402
+ },
1403
+ "mmlu_high_school_microeconomics": {
1404
+ "task": "mmlu_high_school_microeconomics",
1405
+ "task_alias": "high_school_microeconomics",
1406
+ "group": "mmlu_social_sciences",
1407
+ "group_alias": "social_sciences",
1408
+ "dataset_path": "hails/mmlu_no_train",
1409
+ "dataset_name": "high_school_microeconomics",
1410
+ "test_split": "test",
1411
+ "fewshot_split": "dev",
1412
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1413
+ "doc_to_target": "answer",
1414
+ "doc_to_choice": [
1415
+ "A",
1416
+ "B",
1417
+ "C",
1418
+ "D"
1419
+ ],
1420
+ "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
1421
+ "target_delimiter": " ",
1422
+ "fewshot_delimiter": "\n\n",
1423
+ "fewshot_config": {
1424
+ "sampler": "first_n"
1425
+ },
1426
+ "num_fewshot": 5,
1427
+ "metric_list": [
1428
+ {
1429
+ "metric": "acc",
1430
+ "aggregation": "mean",
1431
+ "higher_is_better": true
1432
+ }
1433
+ ],
1434
+ "output_type": "multiple_choice",
1435
+ "repeats": 1,
1436
+ "should_decontaminate": false,
1437
+ "metadata": {
1438
+ "version": 0.0
1439
+ }
1440
+ },
1441
+ "mmlu_high_school_physics": {
1442
+ "task": "mmlu_high_school_physics",
1443
+ "task_alias": "high_school_physics",
1444
+ "group": "mmlu_stem",
1445
+ "group_alias": "stem",
1446
+ "dataset_path": "hails/mmlu_no_train",
1447
+ "dataset_name": "high_school_physics",
1448
+ "test_split": "test",
1449
+ "fewshot_split": "dev",
1450
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1451
+ "doc_to_target": "answer",
1452
+ "doc_to_choice": [
1453
+ "A",
1454
+ "B",
1455
+ "C",
1456
+ "D"
1457
+ ],
1458
+ "description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
1459
+ "target_delimiter": " ",
1460
+ "fewshot_delimiter": "\n\n",
1461
+ "fewshot_config": {
1462
+ "sampler": "first_n"
1463
+ },
1464
+ "num_fewshot": 5,
1465
+ "metric_list": [
1466
+ {
1467
+ "metric": "acc",
1468
+ "aggregation": "mean",
1469
+ "higher_is_better": true
1470
+ }
1471
+ ],
1472
+ "output_type": "multiple_choice",
1473
+ "repeats": 1,
1474
+ "should_decontaminate": false,
1475
+ "metadata": {
1476
+ "version": 0.0
1477
+ }
1478
+ },
1479
+ "mmlu_high_school_psychology": {
1480
+ "task": "mmlu_high_school_psychology",
1481
+ "task_alias": "high_school_psychology",
1482
+ "group": "mmlu_social_sciences",
1483
+ "group_alias": "social_sciences",
1484
+ "dataset_path": "hails/mmlu_no_train",
1485
+ "dataset_name": "high_school_psychology",
1486
+ "test_split": "test",
1487
+ "fewshot_split": "dev",
1488
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1489
+ "doc_to_target": "answer",
1490
+ "doc_to_choice": [
1491
+ "A",
1492
+ "B",
1493
+ "C",
1494
+ "D"
1495
+ ],
1496
+ "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
1497
+ "target_delimiter": " ",
1498
+ "fewshot_delimiter": "\n\n",
1499
+ "fewshot_config": {
1500
+ "sampler": "first_n"
1501
+ },
1502
+ "num_fewshot": 5,
1503
+ "metric_list": [
1504
+ {
1505
+ "metric": "acc",
1506
+ "aggregation": "mean",
1507
+ "higher_is_better": true
1508
+ }
1509
+ ],
1510
+ "output_type": "multiple_choice",
1511
+ "repeats": 1,
1512
+ "should_decontaminate": false,
1513
+ "metadata": {
1514
+ "version": 0.0
1515
+ }
1516
+ },
1517
+ "mmlu_high_school_statistics": {
1518
+ "task": "mmlu_high_school_statistics",
1519
+ "task_alias": "high_school_statistics",
1520
+ "group": "mmlu_stem",
1521
+ "group_alias": "stem",
1522
+ "dataset_path": "hails/mmlu_no_train",
1523
+ "dataset_name": "high_school_statistics",
1524
+ "test_split": "test",
1525
+ "fewshot_split": "dev",
1526
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1527
+ "doc_to_target": "answer",
1528
+ "doc_to_choice": [
1529
+ "A",
1530
+ "B",
1531
+ "C",
1532
+ "D"
1533
+ ],
1534
+ "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
1535
+ "target_delimiter": " ",
1536
+ "fewshot_delimiter": "\n\n",
1537
+ "fewshot_config": {
1538
+ "sampler": "first_n"
1539
+ },
1540
+ "num_fewshot": 5,
1541
+ "metric_list": [
1542
+ {
1543
+ "metric": "acc",
1544
+ "aggregation": "mean",
1545
+ "higher_is_better": true
1546
+ }
1547
+ ],
1548
+ "output_type": "multiple_choice",
1549
+ "repeats": 1,
1550
+ "should_decontaminate": false,
1551
+ "metadata": {
1552
+ "version": 0.0
1553
+ }
1554
+ },
1555
+ "mmlu_high_school_us_history": {
1556
+ "task": "mmlu_high_school_us_history",
1557
+ "task_alias": "high_school_us_history",
1558
+ "group": "mmlu_humanities",
1559
+ "group_alias": "humanities",
1560
+ "dataset_path": "hails/mmlu_no_train",
1561
+ "dataset_name": "high_school_us_history",
1562
+ "test_split": "test",
1563
+ "fewshot_split": "dev",
1564
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1565
+ "doc_to_target": "answer",
1566
+ "doc_to_choice": [
1567
+ "A",
1568
+ "B",
1569
+ "C",
1570
+ "D"
1571
+ ],
1572
+ "description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
1573
+ "target_delimiter": " ",
1574
+ "fewshot_delimiter": "\n\n",
1575
+ "fewshot_config": {
1576
+ "sampler": "first_n"
1577
+ },
1578
+ "num_fewshot": 5,
1579
+ "metric_list": [
1580
+ {
1581
+ "metric": "acc",
1582
+ "aggregation": "mean",
1583
+ "higher_is_better": true
1584
+ }
1585
+ ],
1586
+ "output_type": "multiple_choice",
1587
+ "repeats": 1,
1588
+ "should_decontaminate": false,
1589
+ "metadata": {
1590
+ "version": 0.0
1591
+ }
1592
+ },
1593
+ "mmlu_high_school_world_history": {
1594
+ "task": "mmlu_high_school_world_history",
1595
+ "task_alias": "high_school_world_history",
1596
+ "group": "mmlu_humanities",
1597
+ "group_alias": "humanities",
1598
+ "dataset_path": "hails/mmlu_no_train",
1599
+ "dataset_name": "high_school_world_history",
1600
+ "test_split": "test",
1601
+ "fewshot_split": "dev",
1602
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1603
+ "doc_to_target": "answer",
1604
+ "doc_to_choice": [
1605
+ "A",
1606
+ "B",
1607
+ "C",
1608
+ "D"
1609
+ ],
1610
+ "description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
1611
+ "target_delimiter": " ",
1612
+ "fewshot_delimiter": "\n\n",
1613
+ "fewshot_config": {
1614
+ "sampler": "first_n"
1615
+ },
1616
+ "num_fewshot": 5,
1617
+ "metric_list": [
1618
+ {
1619
+ "metric": "acc",
1620
+ "aggregation": "mean",
1621
+ "higher_is_better": true
1622
+ }
1623
+ ],
1624
+ "output_type": "multiple_choice",
1625
+ "repeats": 1,
1626
+ "should_decontaminate": false,
1627
+ "metadata": {
1628
+ "version": 0.0
1629
+ }
1630
+ },
1631
+ "mmlu_human_aging": {
1632
+ "task": "mmlu_human_aging",
1633
+ "task_alias": "human_aging",
1634
+ "group": "mmlu_other",
1635
+ "group_alias": "other",
1636
+ "dataset_path": "hails/mmlu_no_train",
1637
+ "dataset_name": "human_aging",
1638
+ "test_split": "test",
1639
+ "fewshot_split": "dev",
1640
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1641
+ "doc_to_target": "answer",
1642
+ "doc_to_choice": [
1643
+ "A",
1644
+ "B",
1645
+ "C",
1646
+ "D"
1647
+ ],
1648
+ "description": "The following are multiple choice questions (with answers) about human aging.\n\n",
1649
+ "target_delimiter": " ",
1650
+ "fewshot_delimiter": "\n\n",
1651
+ "fewshot_config": {
1652
+ "sampler": "first_n"
1653
+ },
1654
+ "num_fewshot": 5,
1655
+ "metric_list": [
1656
+ {
1657
+ "metric": "acc",
1658
+ "aggregation": "mean",
1659
+ "higher_is_better": true
1660
+ }
1661
+ ],
1662
+ "output_type": "multiple_choice",
1663
+ "repeats": 1,
1664
+ "should_decontaminate": false,
1665
+ "metadata": {
1666
+ "version": 0.0
1667
+ }
1668
+ },
1669
+ "mmlu_human_sexuality": {
1670
+ "task": "mmlu_human_sexuality",
1671
+ "task_alias": "human_sexuality",
1672
+ "group": "mmlu_social_sciences",
1673
+ "group_alias": "social_sciences",
1674
+ "dataset_path": "hails/mmlu_no_train",
1675
+ "dataset_name": "human_sexuality",
1676
+ "test_split": "test",
1677
+ "fewshot_split": "dev",
1678
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1679
+ "doc_to_target": "answer",
1680
+ "doc_to_choice": [
1681
+ "A",
1682
+ "B",
1683
+ "C",
1684
+ "D"
1685
+ ],
1686
+ "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
1687
+ "target_delimiter": " ",
1688
+ "fewshot_delimiter": "\n\n",
1689
+ "fewshot_config": {
1690
+ "sampler": "first_n"
1691
+ },
1692
+ "num_fewshot": 5,
1693
+ "metric_list": [
1694
+ {
1695
+ "metric": "acc",
1696
+ "aggregation": "mean",
1697
+ "higher_is_better": true
1698
+ }
1699
+ ],
1700
+ "output_type": "multiple_choice",
1701
+ "repeats": 1,
1702
+ "should_decontaminate": false,
1703
+ "metadata": {
1704
+ "version": 0.0
1705
+ }
1706
+ },
1707
+ "mmlu_international_law": {
1708
+ "task": "mmlu_international_law",
1709
+ "task_alias": "international_law",
1710
+ "group": "mmlu_humanities",
1711
+ "group_alias": "humanities",
1712
+ "dataset_path": "hails/mmlu_no_train",
1713
+ "dataset_name": "international_law",
1714
+ "test_split": "test",
1715
+ "fewshot_split": "dev",
1716
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1717
+ "doc_to_target": "answer",
1718
+ "doc_to_choice": [
1719
+ "A",
1720
+ "B",
1721
+ "C",
1722
+ "D"
1723
+ ],
1724
+ "description": "The following are multiple choice questions (with answers) about international law.\n\n",
1725
+ "target_delimiter": " ",
1726
+ "fewshot_delimiter": "\n\n",
1727
+ "fewshot_config": {
1728
+ "sampler": "first_n"
1729
+ },
1730
+ "num_fewshot": 5,
1731
+ "metric_list": [
1732
+ {
1733
+ "metric": "acc",
1734
+ "aggregation": "mean",
1735
+ "higher_is_better": true
1736
+ }
1737
+ ],
1738
+ "output_type": "multiple_choice",
1739
+ "repeats": 1,
1740
+ "should_decontaminate": false,
1741
+ "metadata": {
1742
+ "version": 0.0
1743
+ }
1744
+ },
1745
+ "mmlu_jurisprudence": {
1746
+ "task": "mmlu_jurisprudence",
1747
+ "task_alias": "jurisprudence",
1748
+ "group": "mmlu_humanities",
1749
+ "group_alias": "humanities",
1750
+ "dataset_path": "hails/mmlu_no_train",
1751
+ "dataset_name": "jurisprudence",
1752
+ "test_split": "test",
1753
+ "fewshot_split": "dev",
1754
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1755
+ "doc_to_target": "answer",
1756
+ "doc_to_choice": [
1757
+ "A",
1758
+ "B",
1759
+ "C",
1760
+ "D"
1761
+ ],
1762
+ "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
1763
+ "target_delimiter": " ",
1764
+ "fewshot_delimiter": "\n\n",
1765
+ "fewshot_config": {
1766
+ "sampler": "first_n"
1767
+ },
1768
+ "num_fewshot": 5,
1769
+ "metric_list": [
1770
+ {
1771
+ "metric": "acc",
1772
+ "aggregation": "mean",
1773
+ "higher_is_better": true
1774
+ }
1775
+ ],
1776
+ "output_type": "multiple_choice",
1777
+ "repeats": 1,
1778
+ "should_decontaminate": false,
1779
+ "metadata": {
1780
+ "version": 0.0
1781
+ }
1782
+ },
1783
+ "mmlu_logical_fallacies": {
1784
+ "task": "mmlu_logical_fallacies",
1785
+ "task_alias": "logical_fallacies",
1786
+ "group": "mmlu_humanities",
1787
+ "group_alias": "humanities",
1788
+ "dataset_path": "hails/mmlu_no_train",
1789
+ "dataset_name": "logical_fallacies",
1790
+ "test_split": "test",
1791
+ "fewshot_split": "dev",
1792
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1793
+ "doc_to_target": "answer",
1794
+ "doc_to_choice": [
1795
+ "A",
1796
+ "B",
1797
+ "C",
1798
+ "D"
1799
+ ],
1800
+ "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
1801
+ "target_delimiter": " ",
1802
+ "fewshot_delimiter": "\n\n",
1803
+ "fewshot_config": {
1804
+ "sampler": "first_n"
1805
+ },
1806
+ "num_fewshot": 5,
1807
+ "metric_list": [
1808
+ {
1809
+ "metric": "acc",
1810
+ "aggregation": "mean",
1811
+ "higher_is_better": true
1812
+ }
1813
+ ],
1814
+ "output_type": "multiple_choice",
1815
+ "repeats": 1,
1816
+ "should_decontaminate": false,
1817
+ "metadata": {
1818
+ "version": 0.0
1819
+ }
1820
+ },
1821
+ "mmlu_machine_learning": {
1822
+ "task": "mmlu_machine_learning",
1823
+ "task_alias": "machine_learning",
1824
+ "group": "mmlu_stem",
1825
+ "group_alias": "stem",
1826
+ "dataset_path": "hails/mmlu_no_train",
1827
+ "dataset_name": "machine_learning",
1828
+ "test_split": "test",
1829
+ "fewshot_split": "dev",
1830
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1831
+ "doc_to_target": "answer",
1832
+ "doc_to_choice": [
1833
+ "A",
1834
+ "B",
1835
+ "C",
1836
+ "D"
1837
+ ],
1838
+ "description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
1839
+ "target_delimiter": " ",
1840
+ "fewshot_delimiter": "\n\n",
1841
+ "fewshot_config": {
1842
+ "sampler": "first_n"
1843
+ },
1844
+ "num_fewshot": 5,
1845
+ "metric_list": [
1846
+ {
1847
+ "metric": "acc",
1848
+ "aggregation": "mean",
1849
+ "higher_is_better": true
1850
+ }
1851
+ ],
1852
+ "output_type": "multiple_choice",
1853
+ "repeats": 1,
1854
+ "should_decontaminate": false,
1855
+ "metadata": {
1856
+ "version": 0.0
1857
+ }
1858
+ },
1859
+ "mmlu_management": {
1860
+ "task": "mmlu_management",
1861
+ "task_alias": "management",
1862
+ "group": "mmlu_other",
1863
+ "group_alias": "other",
1864
+ "dataset_path": "hails/mmlu_no_train",
1865
+ "dataset_name": "management",
1866
+ "test_split": "test",
1867
+ "fewshot_split": "dev",
1868
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1869
+ "doc_to_target": "answer",
1870
+ "doc_to_choice": [
1871
+ "A",
1872
+ "B",
1873
+ "C",
1874
+ "D"
1875
+ ],
1876
+ "description": "The following are multiple choice questions (with answers) about management.\n\n",
1877
+ "target_delimiter": " ",
1878
+ "fewshot_delimiter": "\n\n",
1879
+ "fewshot_config": {
1880
+ "sampler": "first_n"
1881
+ },
1882
+ "num_fewshot": 5,
1883
+ "metric_list": [
1884
+ {
1885
+ "metric": "acc",
1886
+ "aggregation": "mean",
1887
+ "higher_is_better": true
1888
+ }
1889
+ ],
1890
+ "output_type": "multiple_choice",
1891
+ "repeats": 1,
1892
+ "should_decontaminate": false,
1893
+ "metadata": {
1894
+ "version": 0.0
1895
+ }
1896
+ },
1897
+ "mmlu_marketing": {
1898
+ "task": "mmlu_marketing",
1899
+ "task_alias": "marketing",
1900
+ "group": "mmlu_other",
1901
+ "group_alias": "other",
1902
+ "dataset_path": "hails/mmlu_no_train",
1903
+ "dataset_name": "marketing",
1904
+ "test_split": "test",
1905
+ "fewshot_split": "dev",
1906
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1907
+ "doc_to_target": "answer",
1908
+ "doc_to_choice": [
1909
+ "A",
1910
+ "B",
1911
+ "C",
1912
+ "D"
1913
+ ],
1914
+ "description": "The following are multiple choice questions (with answers) about marketing.\n\n",
1915
+ "target_delimiter": " ",
1916
+ "fewshot_delimiter": "\n\n",
1917
+ "fewshot_config": {
1918
+ "sampler": "first_n"
1919
+ },
1920
+ "num_fewshot": 5,
1921
+ "metric_list": [
1922
+ {
1923
+ "metric": "acc",
1924
+ "aggregation": "mean",
1925
+ "higher_is_better": true
1926
+ }
1927
+ ],
1928
+ "output_type": "multiple_choice",
1929
+ "repeats": 1,
1930
+ "should_decontaminate": false,
1931
+ "metadata": {
1932
+ "version": 0.0
1933
+ }
1934
+ },
1935
+ "mmlu_medical_genetics": {
1936
+ "task": "mmlu_medical_genetics",
1937
+ "task_alias": "medical_genetics",
1938
+ "group": "mmlu_other",
1939
+ "group_alias": "other",
1940
+ "dataset_path": "hails/mmlu_no_train",
1941
+ "dataset_name": "medical_genetics",
1942
+ "test_split": "test",
1943
+ "fewshot_split": "dev",
1944
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1945
+ "doc_to_target": "answer",
1946
+ "doc_to_choice": [
1947
+ "A",
1948
+ "B",
1949
+ "C",
1950
+ "D"
1951
+ ],
1952
+ "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
1953
+ "target_delimiter": " ",
1954
+ "fewshot_delimiter": "\n\n",
1955
+ "fewshot_config": {
1956
+ "sampler": "first_n"
1957
+ },
1958
+ "num_fewshot": 5,
1959
+ "metric_list": [
1960
+ {
1961
+ "metric": "acc",
1962
+ "aggregation": "mean",
1963
+ "higher_is_better": true
1964
+ }
1965
+ ],
1966
+ "output_type": "multiple_choice",
1967
+ "repeats": 1,
1968
+ "should_decontaminate": false,
1969
+ "metadata": {
1970
+ "version": 0.0
1971
+ }
1972
+ },
1973
+ "mmlu_miscellaneous": {
1974
+ "task": "mmlu_miscellaneous",
1975
+ "task_alias": "miscellaneous",
1976
+ "group": "mmlu_other",
1977
+ "group_alias": "other",
1978
+ "dataset_path": "hails/mmlu_no_train",
1979
+ "dataset_name": "miscellaneous",
1980
+ "test_split": "test",
1981
+ "fewshot_split": "dev",
1982
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1983
+ "doc_to_target": "answer",
1984
+ "doc_to_choice": [
1985
+ "A",
1986
+ "B",
1987
+ "C",
1988
+ "D"
1989
+ ],
1990
+ "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
1991
+ "target_delimiter": " ",
1992
+ "fewshot_delimiter": "\n\n",
1993
+ "fewshot_config": {
1994
+ "sampler": "first_n"
1995
+ },
1996
+ "num_fewshot": 5,
1997
+ "metric_list": [
1998
+ {
1999
+ "metric": "acc",
2000
+ "aggregation": "mean",
2001
+ "higher_is_better": true
2002
+ }
2003
+ ],
2004
+ "output_type": "multiple_choice",
2005
+ "repeats": 1,
2006
+ "should_decontaminate": false,
2007
+ "metadata": {
2008
+ "version": 0.0
2009
+ }
2010
+ },
2011
+ "mmlu_moral_disputes": {
2012
+ "task": "mmlu_moral_disputes",
2013
+ "task_alias": "moral_disputes",
2014
+ "group": "mmlu_humanities",
2015
+ "group_alias": "humanities",
2016
+ "dataset_path": "hails/mmlu_no_train",
2017
+ "dataset_name": "moral_disputes",
2018
+ "test_split": "test",
2019
+ "fewshot_split": "dev",
2020
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2021
+ "doc_to_target": "answer",
2022
+ "doc_to_choice": [
2023
+ "A",
2024
+ "B",
2025
+ "C",
2026
+ "D"
2027
+ ],
2028
+ "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
2029
+ "target_delimiter": " ",
2030
+ "fewshot_delimiter": "\n\n",
2031
+ "fewshot_config": {
2032
+ "sampler": "first_n"
2033
+ },
2034
+ "num_fewshot": 5,
2035
+ "metric_list": [
2036
+ {
2037
+ "metric": "acc",
2038
+ "aggregation": "mean",
2039
+ "higher_is_better": true
2040
+ }
2041
+ ],
2042
+ "output_type": "multiple_choice",
2043
+ "repeats": 1,
2044
+ "should_decontaminate": false,
2045
+ "metadata": {
2046
+ "version": 0.0
2047
+ }
2048
+ },
2049
+ "mmlu_moral_scenarios": {
2050
+ "task": "mmlu_moral_scenarios",
2051
+ "task_alias": "moral_scenarios",
2052
+ "group": "mmlu_humanities",
2053
+ "group_alias": "humanities",
2054
+ "dataset_path": "hails/mmlu_no_train",
2055
+ "dataset_name": "moral_scenarios",
2056
+ "test_split": "test",
2057
+ "fewshot_split": "dev",
2058
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2059
+ "doc_to_target": "answer",
2060
+ "doc_to_choice": [
2061
+ "A",
2062
+ "B",
2063
+ "C",
2064
+ "D"
2065
+ ],
2066
+ "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
2067
+ "target_delimiter": " ",
2068
+ "fewshot_delimiter": "\n\n",
2069
+ "fewshot_config": {
2070
+ "sampler": "first_n"
2071
+ },
2072
+ "num_fewshot": 5,
2073
+ "metric_list": [
2074
+ {
2075
+ "metric": "acc",
2076
+ "aggregation": "mean",
2077
+ "higher_is_better": true
2078
+ }
2079
+ ],
2080
+ "output_type": "multiple_choice",
2081
+ "repeats": 1,
2082
+ "should_decontaminate": false,
2083
+ "metadata": {
2084
+ "version": 0.0
2085
+ }
2086
+ },
2087
+ "mmlu_nutrition": {
2088
+ "task": "mmlu_nutrition",
2089
+ "task_alias": "nutrition",
2090
+ "group": "mmlu_other",
2091
+ "group_alias": "other",
2092
+ "dataset_path": "hails/mmlu_no_train",
2093
+ "dataset_name": "nutrition",
2094
+ "test_split": "test",
2095
+ "fewshot_split": "dev",
2096
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2097
+ "doc_to_target": "answer",
2098
+ "doc_to_choice": [
2099
+ "A",
2100
+ "B",
2101
+ "C",
2102
+ "D"
2103
+ ],
2104
+ "description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
2105
+ "target_delimiter": " ",
2106
+ "fewshot_delimiter": "\n\n",
2107
+ "fewshot_config": {
2108
+ "sampler": "first_n"
2109
+ },
2110
+ "num_fewshot": 5,
2111
+ "metric_list": [
2112
+ {
2113
+ "metric": "acc",
2114
+ "aggregation": "mean",
2115
+ "higher_is_better": true
2116
+ }
2117
+ ],
2118
+ "output_type": "multiple_choice",
2119
+ "repeats": 1,
2120
+ "should_decontaminate": false,
2121
+ "metadata": {
2122
+ "version": 0.0
2123
+ }
2124
+ },
2125
+ "mmlu_philosophy": {
2126
+ "task": "mmlu_philosophy",
2127
+ "task_alias": "philosophy",
2128
+ "group": "mmlu_humanities",
2129
+ "group_alias": "humanities",
2130
+ "dataset_path": "hails/mmlu_no_train",
2131
+ "dataset_name": "philosophy",
2132
+ "test_split": "test",
2133
+ "fewshot_split": "dev",
2134
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2135
+ "doc_to_target": "answer",
2136
+ "doc_to_choice": [
2137
+ "A",
2138
+ "B",
2139
+ "C",
2140
+ "D"
2141
+ ],
2142
+ "description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
2143
+ "target_delimiter": " ",
2144
+ "fewshot_delimiter": "\n\n",
2145
+ "fewshot_config": {
2146
+ "sampler": "first_n"
2147
+ },
2148
+ "num_fewshot": 5,
2149
+ "metric_list": [
2150
+ {
2151
+ "metric": "acc",
2152
+ "aggregation": "mean",
2153
+ "higher_is_better": true
2154
+ }
2155
+ ],
2156
+ "output_type": "multiple_choice",
2157
+ "repeats": 1,
2158
+ "should_decontaminate": false,
2159
+ "metadata": {
2160
+ "version": 0.0
2161
+ }
2162
+ },
2163
+ "mmlu_prehistory": {
2164
+ "task": "mmlu_prehistory",
2165
+ "task_alias": "prehistory",
2166
+ "group": "mmlu_humanities",
2167
+ "group_alias": "humanities",
2168
+ "dataset_path": "hails/mmlu_no_train",
2169
+ "dataset_name": "prehistory",
2170
+ "test_split": "test",
2171
+ "fewshot_split": "dev",
2172
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2173
+ "doc_to_target": "answer",
2174
+ "doc_to_choice": [
2175
+ "A",
2176
+ "B",
2177
+ "C",
2178
+ "D"
2179
+ ],
2180
+ "description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
2181
+ "target_delimiter": " ",
2182
+ "fewshot_delimiter": "\n\n",
2183
+ "fewshot_config": {
2184
+ "sampler": "first_n"
2185
+ },
2186
+ "num_fewshot": 5,
2187
+ "metric_list": [
2188
+ {
2189
+ "metric": "acc",
2190
+ "aggregation": "mean",
2191
+ "higher_is_better": true
2192
+ }
2193
+ ],
2194
+ "output_type": "multiple_choice",
2195
+ "repeats": 1,
2196
+ "should_decontaminate": false,
2197
+ "metadata": {
2198
+ "version": 0.0
2199
+ }
2200
+ },
2201
+ "mmlu_professional_accounting": {
2202
+ "task": "mmlu_professional_accounting",
2203
+ "task_alias": "professional_accounting",
2204
+ "group": "mmlu_other",
2205
+ "group_alias": "other",
2206
+ "dataset_path": "hails/mmlu_no_train",
2207
+ "dataset_name": "professional_accounting",
2208
+ "test_split": "test",
2209
+ "fewshot_split": "dev",
2210
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2211
+ "doc_to_target": "answer",
2212
+ "doc_to_choice": [
2213
+ "A",
2214
+ "B",
2215
+ "C",
2216
+ "D"
2217
+ ],
2218
+ "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
2219
+ "target_delimiter": " ",
2220
+ "fewshot_delimiter": "\n\n",
2221
+ "fewshot_config": {
2222
+ "sampler": "first_n"
2223
+ },
2224
+ "num_fewshot": 5,
2225
+ "metric_list": [
2226
+ {
2227
+ "metric": "acc",
2228
+ "aggregation": "mean",
2229
+ "higher_is_better": true
2230
+ }
2231
+ ],
2232
+ "output_type": "multiple_choice",
2233
+ "repeats": 1,
2234
+ "should_decontaminate": false,
2235
+ "metadata": {
2236
+ "version": 0.0
2237
+ }
2238
+ },
2239
+ "mmlu_professional_law": {
2240
+ "task": "mmlu_professional_law",
2241
+ "task_alias": "professional_law",
2242
+ "group": "mmlu_humanities",
2243
+ "group_alias": "humanities",
2244
+ "dataset_path": "hails/mmlu_no_train",
2245
+ "dataset_name": "professional_law",
2246
+ "test_split": "test",
2247
+ "fewshot_split": "dev",
2248
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2249
+ "doc_to_target": "answer",
2250
+ "doc_to_choice": [
2251
+ "A",
2252
+ "B",
2253
+ "C",
2254
+ "D"
2255
+ ],
2256
+ "description": "The following are multiple choice questions (with answers) about professional law.\n\n",
2257
+ "target_delimiter": " ",
2258
+ "fewshot_delimiter": "\n\n",
2259
+ "fewshot_config": {
2260
+ "sampler": "first_n"
2261
+ },
2262
+ "num_fewshot": 5,
2263
+ "metric_list": [
2264
+ {
2265
+ "metric": "acc",
2266
+ "aggregation": "mean",
2267
+ "higher_is_better": true
2268
+ }
2269
+ ],
2270
+ "output_type": "multiple_choice",
2271
+ "repeats": 1,
2272
+ "should_decontaminate": false,
2273
+ "metadata": {
2274
+ "version": 0.0
2275
+ }
2276
+ },
2277
+ "mmlu_professional_medicine": {
2278
+ "task": "mmlu_professional_medicine",
2279
+ "task_alias": "professional_medicine",
2280
+ "group": "mmlu_other",
2281
+ "group_alias": "other",
2282
+ "dataset_path": "hails/mmlu_no_train",
2283
+ "dataset_name": "professional_medicine",
2284
+ "test_split": "test",
2285
+ "fewshot_split": "dev",
2286
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2287
+ "doc_to_target": "answer",
2288
+ "doc_to_choice": [
2289
+ "A",
2290
+ "B",
2291
+ "C",
2292
+ "D"
2293
+ ],
2294
+ "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
2295
+ "target_delimiter": " ",
2296
+ "fewshot_delimiter": "\n\n",
2297
+ "fewshot_config": {
2298
+ "sampler": "first_n"
2299
+ },
2300
+ "num_fewshot": 5,
2301
+ "metric_list": [
2302
+ {
2303
+ "metric": "acc",
2304
+ "aggregation": "mean",
2305
+ "higher_is_better": true
2306
+ }
2307
+ ],
2308
+ "output_type": "multiple_choice",
2309
+ "repeats": 1,
2310
+ "should_decontaminate": false,
2311
+ "metadata": {
2312
+ "version": 0.0
2313
+ }
2314
+ },
2315
+ "mmlu_professional_psychology": {
2316
+ "task": "mmlu_professional_psychology",
2317
+ "task_alias": "professional_psychology",
2318
+ "group": "mmlu_social_sciences",
2319
+ "group_alias": "social_sciences",
2320
+ "dataset_path": "hails/mmlu_no_train",
2321
+ "dataset_name": "professional_psychology",
2322
+ "test_split": "test",
2323
+ "fewshot_split": "dev",
2324
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2325
+ "doc_to_target": "answer",
2326
+ "doc_to_choice": [
2327
+ "A",
2328
+ "B",
2329
+ "C",
2330
+ "D"
2331
+ ],
2332
+ "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
2333
+ "target_delimiter": " ",
2334
+ "fewshot_delimiter": "\n\n",
2335
+ "fewshot_config": {
2336
+ "sampler": "first_n"
2337
+ },
2338
+ "num_fewshot": 5,
2339
+ "metric_list": [
2340
+ {
2341
+ "metric": "acc",
2342
+ "aggregation": "mean",
2343
+ "higher_is_better": true
2344
+ }
2345
+ ],
2346
+ "output_type": "multiple_choice",
2347
+ "repeats": 1,
2348
+ "should_decontaminate": false,
2349
+ "metadata": {
2350
+ "version": 0.0
2351
+ }
2352
+ },
2353
+ "mmlu_public_relations": {
2354
+ "task": "mmlu_public_relations",
2355
+ "task_alias": "public_relations",
2356
+ "group": "mmlu_social_sciences",
2357
+ "group_alias": "social_sciences",
2358
+ "dataset_path": "hails/mmlu_no_train",
2359
+ "dataset_name": "public_relations",
2360
+ "test_split": "test",
2361
+ "fewshot_split": "dev",
2362
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2363
+ "doc_to_target": "answer",
2364
+ "doc_to_choice": [
2365
+ "A",
2366
+ "B",
2367
+ "C",
2368
+ "D"
2369
+ ],
2370
+ "description": "The following are multiple choice questions (with answers) about public relations.\n\n",
2371
+ "target_delimiter": " ",
2372
+ "fewshot_delimiter": "\n\n",
2373
+ "fewshot_config": {
2374
+ "sampler": "first_n"
2375
+ },
2376
+ "num_fewshot": 5,
2377
+ "metric_list": [
2378
+ {
2379
+ "metric": "acc",
2380
+ "aggregation": "mean",
2381
+ "higher_is_better": true
2382
+ }
2383
+ ],
2384
+ "output_type": "multiple_choice",
2385
+ "repeats": 1,
2386
+ "should_decontaminate": false,
2387
+ "metadata": {
2388
+ "version": 0.0
2389
+ }
2390
+ },
2391
+ "mmlu_security_studies": {
2392
+ "task": "mmlu_security_studies",
2393
+ "task_alias": "security_studies",
2394
+ "group": "mmlu_social_sciences",
2395
+ "group_alias": "social_sciences",
2396
+ "dataset_path": "hails/mmlu_no_train",
2397
+ "dataset_name": "security_studies",
2398
+ "test_split": "test",
2399
+ "fewshot_split": "dev",
2400
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2401
+ "doc_to_target": "answer",
2402
+ "doc_to_choice": [
2403
+ "A",
2404
+ "B",
2405
+ "C",
2406
+ "D"
2407
+ ],
2408
+ "description": "The following are multiple choice questions (with answers) about security studies.\n\n",
2409
+ "target_delimiter": " ",
2410
+ "fewshot_delimiter": "\n\n",
2411
+ "fewshot_config": {
2412
+ "sampler": "first_n"
2413
+ },
2414
+ "num_fewshot": 5,
2415
+ "metric_list": [
2416
+ {
2417
+ "metric": "acc",
2418
+ "aggregation": "mean",
2419
+ "higher_is_better": true
2420
+ }
2421
+ ],
2422
+ "output_type": "multiple_choice",
2423
+ "repeats": 1,
2424
+ "should_decontaminate": false,
2425
+ "metadata": {
2426
+ "version": 0.0
2427
+ }
2428
+ },
2429
+ "mmlu_sociology": {
2430
+ "task": "mmlu_sociology",
2431
+ "task_alias": "sociology",
2432
+ "group": "mmlu_social_sciences",
2433
+ "group_alias": "social_sciences",
2434
+ "dataset_path": "hails/mmlu_no_train",
2435
+ "dataset_name": "sociology",
2436
+ "test_split": "test",
2437
+ "fewshot_split": "dev",
2438
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2439
+ "doc_to_target": "answer",
2440
+ "doc_to_choice": [
2441
+ "A",
2442
+ "B",
2443
+ "C",
2444
+ "D"
2445
+ ],
2446
+ "description": "The following are multiple choice questions (with answers) about sociology.\n\n",
2447
+ "target_delimiter": " ",
2448
+ "fewshot_delimiter": "\n\n",
2449
+ "fewshot_config": {
2450
+ "sampler": "first_n"
2451
+ },
2452
+ "num_fewshot": 5,
2453
+ "metric_list": [
2454
+ {
2455
+ "metric": "acc",
2456
+ "aggregation": "mean",
2457
+ "higher_is_better": true
2458
+ }
2459
+ ],
2460
+ "output_type": "multiple_choice",
2461
+ "repeats": 1,
2462
+ "should_decontaminate": false,
2463
+ "metadata": {
2464
+ "version": 0.0
2465
+ }
2466
+ },
2467
+ "mmlu_us_foreign_policy": {
2468
+ "task": "mmlu_us_foreign_policy",
2469
+ "task_alias": "us_foreign_policy",
2470
+ "group": "mmlu_social_sciences",
2471
+ "group_alias": "social_sciences",
2472
+ "dataset_path": "hails/mmlu_no_train",
2473
+ "dataset_name": "us_foreign_policy",
2474
+ "test_split": "test",
2475
+ "fewshot_split": "dev",
2476
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2477
+ "doc_to_target": "answer",
2478
+ "doc_to_choice": [
2479
+ "A",
2480
+ "B",
2481
+ "C",
2482
+ "D"
2483
+ ],
2484
+ "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
2485
+ "target_delimiter": " ",
2486
+ "fewshot_delimiter": "\n\n",
2487
+ "fewshot_config": {
2488
+ "sampler": "first_n"
2489
+ },
2490
+ "num_fewshot": 5,
2491
+ "metric_list": [
2492
+ {
2493
+ "metric": "acc",
2494
+ "aggregation": "mean",
2495
+ "higher_is_better": true
2496
+ }
2497
+ ],
2498
+ "output_type": "multiple_choice",
2499
+ "repeats": 1,
2500
+ "should_decontaminate": false,
2501
+ "metadata": {
2502
+ "version": 0.0
2503
+ }
2504
+ },
2505
+ "mmlu_virology": {
2506
+ "task": "mmlu_virology",
2507
+ "task_alias": "virology",
2508
+ "group": "mmlu_other",
2509
+ "group_alias": "other",
2510
+ "dataset_path": "hails/mmlu_no_train",
2511
+ "dataset_name": "virology",
2512
+ "test_split": "test",
2513
+ "fewshot_split": "dev",
2514
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2515
+ "doc_to_target": "answer",
2516
+ "doc_to_choice": [
2517
+ "A",
2518
+ "B",
2519
+ "C",
2520
+ "D"
2521
+ ],
2522
+ "description": "The following are multiple choice questions (with answers) about virology.\n\n",
2523
+ "target_delimiter": " ",
2524
+ "fewshot_delimiter": "\n\n",
2525
+ "fewshot_config": {
2526
+ "sampler": "first_n"
2527
+ },
2528
+ "num_fewshot": 5,
2529
+ "metric_list": [
2530
+ {
2531
+ "metric": "acc",
2532
+ "aggregation": "mean",
2533
+ "higher_is_better": true
2534
+ }
2535
+ ],
2536
+ "output_type": "multiple_choice",
2537
+ "repeats": 1,
2538
+ "should_decontaminate": false,
2539
+ "metadata": {
2540
+ "version": 0.0
2541
+ }
2542
+ },
2543
+ "mmlu_world_religions": {
2544
+ "task": "mmlu_world_religions",
2545
+ "task_alias": "world_religions",
2546
+ "group": "mmlu_humanities",
2547
+ "group_alias": "humanities",
2548
+ "dataset_path": "hails/mmlu_no_train",
2549
+ "dataset_name": "world_religions",
2550
+ "test_split": "test",
2551
+ "fewshot_split": "dev",
2552
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2553
+ "doc_to_target": "answer",
2554
+ "doc_to_choice": [
2555
+ "A",
2556
+ "B",
2557
+ "C",
2558
+ "D"
2559
+ ],
2560
+ "description": "The following are multiple choice questions (with answers) about world religions.\n\n",
2561
+ "target_delimiter": " ",
2562
+ "fewshot_delimiter": "\n\n",
2563
+ "fewshot_config": {
2564
+ "sampler": "first_n"
2565
+ },
2566
+ "num_fewshot": 5,
2567
+ "metric_list": [
2568
+ {
2569
+ "metric": "acc",
2570
+ "aggregation": "mean",
2571
+ "higher_is_better": true
2572
+ }
2573
+ ],
2574
+ "output_type": "multiple_choice",
2575
+ "repeats": 1,
2576
+ "should_decontaminate": false,
2577
+ "metadata": {
2578
+ "version": 0.0
2579
+ }
2580
+ }
2581
+ },
2582
+ "versions": {
2583
+ "mmlu_abstract_algebra": 0.0,
2584
+ "mmlu_anatomy": 0.0,
2585
+ "mmlu_astronomy": 0.0,
2586
+ "mmlu_business_ethics": 0.0,
2587
+ "mmlu_clinical_knowledge": 0.0,
2588
+ "mmlu_college_biology": 0.0,
2589
+ "mmlu_college_chemistry": 0.0,
2590
+ "mmlu_college_computer_science": 0.0,
2591
+ "mmlu_college_mathematics": 0.0,
2592
+ "mmlu_college_medicine": 0.0,
2593
+ "mmlu_college_physics": 0.0,
2594
+ "mmlu_computer_security": 0.0,
2595
+ "mmlu_conceptual_physics": 0.0,
2596
+ "mmlu_econometrics": 0.0,
2597
+ "mmlu_electrical_engineering": 0.0,
2598
+ "mmlu_elementary_mathematics": 0.0,
2599
+ "mmlu_formal_logic": 0.0,
2600
+ "mmlu_global_facts": 0.0,
2601
+ "mmlu_high_school_biology": 0.0,
2602
+ "mmlu_high_school_chemistry": 0.0,
2603
+ "mmlu_high_school_computer_science": 0.0,
2604
+ "mmlu_high_school_european_history": 0.0,
2605
+ "mmlu_high_school_geography": 0.0,
2606
+ "mmlu_high_school_government_and_politics": 0.0,
2607
+ "mmlu_high_school_macroeconomics": 0.0,
2608
+ "mmlu_high_school_mathematics": 0.0,
2609
+ "mmlu_high_school_microeconomics": 0.0,
2610
+ "mmlu_high_school_physics": 0.0,
2611
+ "mmlu_high_school_psychology": 0.0,
2612
+ "mmlu_high_school_statistics": 0.0,
2613
+ "mmlu_high_school_us_history": 0.0,
2614
+ "mmlu_high_school_world_history": 0.0,
2615
+ "mmlu_human_aging": 0.0,
2616
+ "mmlu_human_sexuality": 0.0,
2617
+ "mmlu_international_law": 0.0,
2618
+ "mmlu_jurisprudence": 0.0,
2619
+ "mmlu_logical_fallacies": 0.0,
2620
+ "mmlu_machine_learning": 0.0,
2621
+ "mmlu_management": 0.0,
2622
+ "mmlu_marketing": 0.0,
2623
+ "mmlu_medical_genetics": 0.0,
2624
+ "mmlu_miscellaneous": 0.0,
2625
+ "mmlu_moral_disputes": 0.0,
2626
+ "mmlu_moral_scenarios": 0.0,
2627
+ "mmlu_nutrition": 0.0,
2628
+ "mmlu_philosophy": 0.0,
2629
+ "mmlu_prehistory": 0.0,
2630
+ "mmlu_professional_accounting": 0.0,
2631
+ "mmlu_professional_law": 0.0,
2632
+ "mmlu_professional_medicine": 0.0,
2633
+ "mmlu_professional_psychology": 0.0,
2634
+ "mmlu_public_relations": 0.0,
2635
+ "mmlu_security_studies": 0.0,
2636
+ "mmlu_sociology": 0.0,
2637
+ "mmlu_us_foreign_policy": 0.0,
2638
+ "mmlu_virology": 0.0,
2639
+ "mmlu_world_religions": 0.0
2640
+ },
2641
+ "n-shot": {
2642
+ "mmlu": 0,
2643
+ "mmlu_abstract_algebra": 5,
2644
+ "mmlu_anatomy": 5,
2645
+ "mmlu_astronomy": 5,
2646
+ "mmlu_business_ethics": 5,
2647
+ "mmlu_clinical_knowledge": 5,
2648
+ "mmlu_college_biology": 5,
2649
+ "mmlu_college_chemistry": 5,
2650
+ "mmlu_college_computer_science": 5,
2651
+ "mmlu_college_mathematics": 5,
2652
+ "mmlu_college_medicine": 5,
2653
+ "mmlu_college_physics": 5,
2654
+ "mmlu_computer_security": 5,
2655
+ "mmlu_conceptual_physics": 5,
2656
+ "mmlu_econometrics": 5,
2657
+ "mmlu_electrical_engineering": 5,
2658
+ "mmlu_elementary_mathematics": 5,
2659
+ "mmlu_formal_logic": 5,
2660
+ "mmlu_global_facts": 5,
2661
+ "mmlu_high_school_biology": 5,
2662
+ "mmlu_high_school_chemistry": 5,
2663
+ "mmlu_high_school_computer_science": 5,
2664
+ "mmlu_high_school_european_history": 5,
2665
+ "mmlu_high_school_geography": 5,
2666
+ "mmlu_high_school_government_and_politics": 5,
2667
+ "mmlu_high_school_macroeconomics": 5,
2668
+ "mmlu_high_school_mathematics": 5,
2669
+ "mmlu_high_school_microeconomics": 5,
2670
+ "mmlu_high_school_physics": 5,
2671
+ "mmlu_high_school_psychology": 5,
2672
+ "mmlu_high_school_statistics": 5,
2673
+ "mmlu_high_school_us_history": 5,
2674
+ "mmlu_high_school_world_history": 5,
2675
+ "mmlu_human_aging": 5,
2676
+ "mmlu_human_sexuality": 5,
2677
+ "mmlu_humanities": 5,
2678
+ "mmlu_international_law": 5,
2679
+ "mmlu_jurisprudence": 5,
2680
+ "mmlu_logical_fallacies": 5,
2681
+ "mmlu_machine_learning": 5,
2682
+ "mmlu_management": 5,
2683
+ "mmlu_marketing": 5,
2684
+ "mmlu_medical_genetics": 5,
2685
+ "mmlu_miscellaneous": 5,
2686
+ "mmlu_moral_disputes": 5,
2687
+ "mmlu_moral_scenarios": 5,
2688
+ "mmlu_nutrition": 5,
2689
+ "mmlu_other": 5,
2690
+ "mmlu_philosophy": 5,
2691
+ "mmlu_prehistory": 5,
2692
+ "mmlu_professional_accounting": 5,
2693
+ "mmlu_professional_law": 5,
2694
+ "mmlu_professional_medicine": 5,
2695
+ "mmlu_professional_psychology": 5,
2696
+ "mmlu_public_relations": 5,
2697
+ "mmlu_security_studies": 5,
2698
+ "mmlu_social_sciences": 5,
2699
+ "mmlu_sociology": 5,
2700
+ "mmlu_stem": 5,
2701
+ "mmlu_us_foreign_policy": 5,
2702
+ "mmlu_virology": 5,
2703
+ "mmlu_world_religions": 5
2704
+ },
2705
+ "higher_is_better": {
2706
+ "mmlu": {
2707
+ "acc": true
2708
+ },
2709
+ "mmlu_abstract_algebra": {
2710
+ "acc": true
2711
+ },
2712
+ "mmlu_anatomy": {
2713
+ "acc": true
2714
+ },
2715
+ "mmlu_astronomy": {
2716
+ "acc": true
2717
+ },
2718
+ "mmlu_business_ethics": {
2719
+ "acc": true
2720
+ },
2721
+ "mmlu_clinical_knowledge": {
2722
+ "acc": true
2723
+ },
2724
+ "mmlu_college_biology": {
2725
+ "acc": true
2726
+ },
2727
+ "mmlu_college_chemistry": {
2728
+ "acc": true
2729
+ },
2730
+ "mmlu_college_computer_science": {
2731
+ "acc": true
2732
+ },
2733
+ "mmlu_college_mathematics": {
2734
+ "acc": true
2735
+ },
2736
+ "mmlu_college_medicine": {
2737
+ "acc": true
2738
+ },
2739
+ "mmlu_college_physics": {
2740
+ "acc": true
2741
+ },
2742
+ "mmlu_computer_security": {
2743
+ "acc": true
2744
+ },
2745
+ "mmlu_conceptual_physics": {
2746
+ "acc": true
2747
+ },
2748
+ "mmlu_econometrics": {
2749
+ "acc": true
2750
+ },
2751
+ "mmlu_electrical_engineering": {
2752
+ "acc": true
2753
+ },
2754
+ "mmlu_elementary_mathematics": {
2755
+ "acc": true
2756
+ },
2757
+ "mmlu_formal_logic": {
2758
+ "acc": true
2759
+ },
2760
+ "mmlu_global_facts": {
2761
+ "acc": true
2762
+ },
2763
+ "mmlu_high_school_biology": {
2764
+ "acc": true
2765
+ },
2766
+ "mmlu_high_school_chemistry": {
2767
+ "acc": true
2768
+ },
2769
+ "mmlu_high_school_computer_science": {
2770
+ "acc": true
2771
+ },
2772
+ "mmlu_high_school_european_history": {
2773
+ "acc": true
2774
+ },
2775
+ "mmlu_high_school_geography": {
2776
+ "acc": true
2777
+ },
2778
+ "mmlu_high_school_government_and_politics": {
2779
+ "acc": true
2780
+ },
2781
+ "mmlu_high_school_macroeconomics": {
2782
+ "acc": true
2783
+ },
2784
+ "mmlu_high_school_mathematics": {
2785
+ "acc": true
2786
+ },
2787
+ "mmlu_high_school_microeconomics": {
2788
+ "acc": true
2789
+ },
2790
+ "mmlu_high_school_physics": {
2791
+ "acc": true
2792
+ },
2793
+ "mmlu_high_school_psychology": {
2794
+ "acc": true
2795
+ },
2796
+ "mmlu_high_school_statistics": {
2797
+ "acc": true
2798
+ },
2799
+ "mmlu_high_school_us_history": {
2800
+ "acc": true
2801
+ },
2802
+ "mmlu_high_school_world_history": {
2803
+ "acc": true
2804
+ },
2805
+ "mmlu_human_aging": {
2806
+ "acc": true
2807
+ },
2808
+ "mmlu_human_sexuality": {
2809
+ "acc": true
2810
+ },
2811
+ "mmlu_humanities": {
2812
+ "acc": true
2813
+ },
2814
+ "mmlu_international_law": {
2815
+ "acc": true
2816
+ },
2817
+ "mmlu_jurisprudence": {
2818
+ "acc": true
2819
+ },
2820
+ "mmlu_logical_fallacies": {
2821
+ "acc": true
2822
+ },
2823
+ "mmlu_machine_learning": {
2824
+ "acc": true
2825
+ },
2826
+ "mmlu_management": {
2827
+ "acc": true
2828
+ },
2829
+ "mmlu_marketing": {
2830
+ "acc": true
2831
+ },
2832
+ "mmlu_medical_genetics": {
2833
+ "acc": true
2834
+ },
2835
+ "mmlu_miscellaneous": {
2836
+ "acc": true
2837
+ },
2838
+ "mmlu_moral_disputes": {
2839
+ "acc": true
2840
+ },
2841
+ "mmlu_moral_scenarios": {
2842
+ "acc": true
2843
+ },
2844
+ "mmlu_nutrition": {
2845
+ "acc": true
2846
+ },
2847
+ "mmlu_other": {
2848
+ "acc": true
2849
+ },
2850
+ "mmlu_philosophy": {
2851
+ "acc": true
2852
+ },
2853
+ "mmlu_prehistory": {
2854
+ "acc": true
2855
+ },
2856
+ "mmlu_professional_accounting": {
2857
+ "acc": true
2858
+ },
2859
+ "mmlu_professional_law": {
2860
+ "acc": true
2861
+ },
2862
+ "mmlu_professional_medicine": {
2863
+ "acc": true
2864
+ },
2865
+ "mmlu_professional_psychology": {
2866
+ "acc": true
2867
+ },
2868
+ "mmlu_public_relations": {
2869
+ "acc": true
2870
+ },
2871
+ "mmlu_security_studies": {
2872
+ "acc": true
2873
+ },
2874
+ "mmlu_social_sciences": {
2875
+ "acc": true
2876
+ },
2877
+ "mmlu_sociology": {
2878
+ "acc": true
2879
+ },
2880
+ "mmlu_stem": {
2881
+ "acc": true
2882
+ },
2883
+ "mmlu_us_foreign_policy": {
2884
+ "acc": true
2885
+ },
2886
+ "mmlu_virology": {
2887
+ "acc": true
2888
+ },
2889
+ "mmlu_world_religions": {
2890
+ "acc": true
2891
+ }
2892
+ },
2893
+ "n-samples": {
2894
+ "mmlu_moral_scenarios": {
2895
+ "original": 895,
2896
+ "effective": 895
2897
+ },
2898
+ "mmlu_high_school_us_history": {
2899
+ "original": 204,
2900
+ "effective": 204
2901
+ },
2902
+ "mmlu_high_school_world_history": {
2903
+ "original": 237,
2904
+ "effective": 237
2905
+ },
2906
+ "mmlu_world_religions": {
2907
+ "original": 171,
2908
+ "effective": 171
2909
+ },
2910
+ "mmlu_formal_logic": {
2911
+ "original": 126,
2912
+ "effective": 126
2913
+ },
2914
+ "mmlu_moral_disputes": {
2915
+ "original": 346,
2916
+ "effective": 346
2917
+ },
2918
+ "mmlu_prehistory": {
2919
+ "original": 324,
2920
+ "effective": 324
2921
+ },
2922
+ "mmlu_international_law": {
2923
+ "original": 121,
2924
+ "effective": 121
2925
+ },
2926
+ "mmlu_logical_fallacies": {
2927
+ "original": 163,
2928
+ "effective": 163
2929
+ },
2930
+ "mmlu_professional_law": {
2931
+ "original": 1534,
2932
+ "effective": 1534
2933
+ },
2934
+ "mmlu_philosophy": {
2935
+ "original": 311,
2936
+ "effective": 311
2937
+ },
2938
+ "mmlu_high_school_european_history": {
2939
+ "original": 165,
2940
+ "effective": 165
2941
+ },
2942
+ "mmlu_jurisprudence": {
2943
+ "original": 108,
2944
+ "effective": 108
2945
+ },
2946
+ "mmlu_high_school_psychology": {
2947
+ "original": 545,
2948
+ "effective": 545
2949
+ },
2950
+ "mmlu_high_school_geography": {
2951
+ "original": 198,
2952
+ "effective": 198
2953
+ },
2954
+ "mmlu_high_school_macroeconomics": {
2955
+ "original": 390,
2956
+ "effective": 390
2957
+ },
2958
+ "mmlu_public_relations": {
2959
+ "original": 110,
2960
+ "effective": 110
2961
+ },
2962
+ "mmlu_security_studies": {
2963
+ "original": 245,
2964
+ "effective": 245
2965
+ },
2966
+ "mmlu_high_school_microeconomics": {
2967
+ "original": 238,
2968
+ "effective": 238
2969
+ },
2970
+ "mmlu_human_sexuality": {
2971
+ "original": 131,
2972
+ "effective": 131
2973
+ },
2974
+ "mmlu_sociology": {
2975
+ "original": 201,
2976
+ "effective": 201
2977
+ },
2978
+ "mmlu_professional_psychology": {
2979
+ "original": 612,
2980
+ "effective": 612
2981
+ },
2982
+ "mmlu_econometrics": {
2983
+ "original": 114,
2984
+ "effective": 114
2985
+ },
2986
+ "mmlu_us_foreign_policy": {
2987
+ "original": 100,
2988
+ "effective": 100
2989
+ },
2990
+ "mmlu_high_school_government_and_politics": {
2991
+ "original": 193,
2992
+ "effective": 193
2993
+ },
2994
+ "mmlu_marketing": {
2995
+ "original": 234,
2996
+ "effective": 234
2997
+ },
2998
+ "mmlu_professional_accounting": {
2999
+ "original": 282,
3000
+ "effective": 282
3001
+ },
3002
+ "mmlu_clinical_knowledge": {
3003
+ "original": 265,
3004
+ "effective": 265
3005
+ },
3006
+ "mmlu_college_medicine": {
3007
+ "original": 173,
3008
+ "effective": 173
3009
+ },
3010
+ "mmlu_miscellaneous": {
3011
+ "original": 783,
3012
+ "effective": 783
3013
+ },
3014
+ "mmlu_virology": {
3015
+ "original": 166,
3016
+ "effective": 166
3017
+ },
3018
+ "mmlu_business_ethics": {
3019
+ "original": 100,
3020
+ "effective": 100
3021
+ },
3022
+ "mmlu_professional_medicine": {
3023
+ "original": 272,
3024
+ "effective": 272
3025
+ },
3026
+ "mmlu_global_facts": {
3027
+ "original": 100,
3028
+ "effective": 100
3029
+ },
3030
+ "mmlu_nutrition": {
3031
+ "original": 306,
3032
+ "effective": 306
3033
+ },
3034
+ "mmlu_human_aging": {
3035
+ "original": 223,
3036
+ "effective": 223
3037
+ },
3038
+ "mmlu_management": {
3039
+ "original": 103,
3040
+ "effective": 103
3041
+ },
3042
+ "mmlu_medical_genetics": {
3043
+ "original": 100,
3044
+ "effective": 100
3045
+ },
3046
+ "mmlu_college_biology": {
3047
+ "original": 144,
3048
+ "effective": 144
3049
+ },
3050
+ "mmlu_high_school_computer_science": {
3051
+ "original": 100,
3052
+ "effective": 100
3053
+ },
3054
+ "mmlu_elementary_mathematics": {
3055
+ "original": 378,
3056
+ "effective": 378
3057
+ },
3058
+ "mmlu_astronomy": {
3059
+ "original": 152,
3060
+ "effective": 152
3061
+ },
3062
+ "mmlu_machine_learning": {
3063
+ "original": 112,
3064
+ "effective": 112
3065
+ },
3066
+ "mmlu_high_school_mathematics": {
3067
+ "original": 270,
3068
+ "effective": 270
3069
+ },
3070
+ "mmlu_electrical_engineering": {
3071
+ "original": 145,
3072
+ "effective": 145
3073
+ },
3074
+ "mmlu_college_chemistry": {
3075
+ "original": 100,
3076
+ "effective": 100
3077
+ },
3078
+ "mmlu_college_mathematics": {
3079
+ "original": 100,
3080
+ "effective": 100
3081
+ },
3082
+ "mmlu_high_school_statistics": {
3083
+ "original": 216,
3084
+ "effective": 216
3085
+ },
3086
+ "mmlu_high_school_biology": {
3087
+ "original": 310,
3088
+ "effective": 310
3089
+ },
3090
+ "mmlu_abstract_algebra": {
3091
+ "original": 100,
3092
+ "effective": 100
3093
+ },
3094
+ "mmlu_college_physics": {
3095
+ "original": 102,
3096
+ "effective": 102
3097
+ },
3098
+ "mmlu_conceptual_physics": {
3099
+ "original": 235,
3100
+ "effective": 235
3101
+ },
3102
+ "mmlu_computer_security": {
3103
+ "original": 100,
3104
+ "effective": 100
3105
+ },
3106
+ "mmlu_anatomy": {
3107
+ "original": 135,
3108
+ "effective": 135
3109
+ },
3110
+ "mmlu_college_computer_science": {
3111
+ "original": 100,
3112
+ "effective": 100
3113
+ },
3114
+ "mmlu_high_school_physics": {
3115
+ "original": 151,
3116
+ "effective": 151
3117
+ },
3118
+ "mmlu_high_school_chemistry": {
3119
+ "original": 203,
3120
+ "effective": 203
3121
+ }
3122
+ },
3123
+ "config": {
3124
+ "model": "hf",
3125
+ "model_args": "pretrained=/home/migel/Tess-v2.5-qwen2-72B-safetensors,parallelize=True",
3126
+ "model_num_parameters": 72706203648,
3127
+ "model_dtype": "torch.float16",
3128
+ "model_revision": "main",
3129
+ "model_sha": "",
3130
+ "batch_size": "8",
3131
+ "batch_sizes": [],
3132
+ "device": null,
3133
+ "use_cache": null,
3134
+ "limit": null,
3135
+ "bootstrap_iters": 100000,
3136
+ "gen_kwargs": null,
3137
+ "random_seed": 0,
3138
+ "numpy_seed": 1234,
3139
+ "torch_seed": 1234,
3140
+ "fewshot_seed": 1234
3141
+ },
3142
+ "git_hash": "b3e4c49a",
3143
+ "date": 1718167288.656124,
3144
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.31\n\nPython version: 3.10.14 (main, Apr 6 2024, 18:45:05) [GCC 9.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1050-azure-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\nGPU 2: NVIDIA A100 80GB PCIe\nGPU 3: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 530.30.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 48 bits physical, 48 bits virtual\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nNUMA node(s): 4\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7V13 64-Core Processor\nStepping: 1\nCPU MHz: 2445.435\nBogoMIPS: 4890.87\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB\nL1i cache: 3 MiB\nL2 cache: 48 MiB\nL3 cache: 384 MiB\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] magma-cuda117 2.6.1 1 pytorch\n[conda] mkl 2022.2.1 pypi_0 pypi\n[conda] mkl-include 2022.2.1 pypi_0 pypi\n[conda] numpy 1.24.4 pypi_0 pypi\n[conda] pytorch-lightning 1.9.5 pypi_0 pypi\n[conda] torch 2.0.1 pypi_0 pypi\n[conda] torch-nebula 0.16.10 pypi_0 pypi\n[conda] torch-ort 1.17.0 pypi_0 pypi\n[conda] torchaudio 2.0.2+cu117 pypi_0 pypi\n[conda] torchdata 0.6.1 pypi_0 pypi\n[conda] torchmetrics 1.2.0 pypi_0 pypi\n[conda] torchsnapshot 0.1.0 pypi_0 pypi\n[conda] torchvision 0.15.2+cu117 pypi_0 pypi\n[conda] triton 2.0.0 pypi_0 pypi",
3145
+ "transformers_version": "4.41.1",
3146
+ "upper_git_hash": null,
3147
+ "task_hashes": {},
3148
+ "model_source": "hf",
3149
+ "model_name": "/home/migel/Tess-v2.5-qwen2-72B-safetensors",
3150
+ "model_name_sanitized": "__home__migel__Tess-v2.5-qwen2-72B-safetensors",
3151
+ "system_instruction": null,
3152
+ "system_instruction_sha": null,
3153
+ "chat_template": null,
3154
+ "chat_template_sha": null,
3155
+ "start_time": 380863.826540975,
3156
+ "end_time": 388726.503174757,
3157
+ "total_evaluation_time_seconds": "7862.676633781986"
3158
+ }
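
For context, here is a minimal sketch (not part of the uploaded file) of how an evaluation run matching the `config` block above could be launched through the lm-evaluation-harness Python API. The model path, batch size, and task group are taken directly from the report; the 5-shot setting mirrors the per-subtask `"num_fewshot": 5` entries, and the exact API surface may differ slightly between harness versions (this run's `git_hash` is b3e4c49a).

```python
# Sketch: reproducing the MMLU run described in this results file.
# Values are copied from the "config" / "n-shot" blocks above; seeds and
# other options fall back to the harness defaults reported there.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=/home/migel/Tess-v2.5-qwen2-72B-safetensors,parallelize=True",
    tasks=["mmlu"],      # expands to the 57 mmlu_* subtasks listed above
    num_fewshot=5,       # matches the per-subtask "num_fewshot": 5 entries
    batch_size=8,
)

# The returned dict has the same shape as this file: per-task accuracies
# under "results", task configs under "configs", and run metadata under "config".
print(results["results"]["mmlu"])
```

This is only an illustration of how the reported fields map onto a harness invocation; the original run may have been launched via the `lm_eval` CLI rather than the Python API.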