migtissera commited on
Commit
564e6fe
1 Parent(s): d048d70

Delete Tess-v2.5-Qwen2-72B-mmlu.json

Browse files
Files changed (1) hide show
  1. Tess-v2.5-Qwen2-72B-mmlu.json +0 -3158
Tess-v2.5-Qwen2-72B-mmlu.json DELETED
@@ -1,3158 +0,0 @@
1
- {
2
- "results": {
3
- "mmlu": {
4
- "acc,none": 0.8439680957128615,
5
- "acc_stderr,none": 0.0029499711040394372,
6
- "alias": "mmlu"
7
- },
8
- "mmlu_humanities": {
9
- "alias": " - humanities",
10
- "acc,none": 0.8146652497343252,
11
- "acc_stderr,none": 0.005505402478774841
12
- },
13
- "mmlu_formal_logic": {
14
- "alias": " - formal_logic",
15
- "acc,none": 0.7301587301587301,
16
- "acc_stderr,none": 0.03970158273235173
17
- },
18
- "mmlu_high_school_european_history": {
19
- "alias": " - high_school_european_history",
20
- "acc,none": 0.8909090909090909,
21
- "acc_stderr,none": 0.02434383813514564
22
- },
23
- "mmlu_high_school_us_history": {
24
- "alias": " - high_school_us_history",
25
- "acc,none": 0.9509803921568627,
26
- "acc_stderr,none": 0.01515383934021267
27
- },
28
- "mmlu_high_school_world_history": {
29
- "alias": " - high_school_world_history",
30
- "acc,none": 0.9409282700421941,
31
- "acc_stderr,none": 0.01534659746388869
32
- },
33
- "mmlu_international_law": {
34
- "alias": " - international_law",
35
- "acc,none": 0.9173553719008265,
36
- "acc_stderr,none": 0.025135382356604227
37
- },
38
- "mmlu_jurisprudence": {
39
- "alias": " - jurisprudence",
40
- "acc,none": 0.8796296296296297,
41
- "acc_stderr,none": 0.031457038543062525
42
- },
43
- "mmlu_logical_fallacies": {
44
- "alias": " - logical_fallacies",
45
- "acc,none": 0.901840490797546,
46
- "acc_stderr,none": 0.023376180231059605
47
- },
48
- "mmlu_moral_disputes": {
49
- "alias": " - moral_disputes",
50
- "acc,none": 0.869942196531792,
51
- "acc_stderr,none": 0.01810939152822133
52
- },
53
- "mmlu_moral_scenarios": {
54
- "alias": " - moral_scenarios",
55
- "acc,none": 0.829050279329609,
56
- "acc_stderr,none": 0.012590873868789222
57
- },
58
- "mmlu_philosophy": {
59
- "alias": " - philosophy",
60
- "acc,none": 0.8713826366559485,
61
- "acc_stderr,none": 0.01901399630412152
62
- },
63
- "mmlu_prehistory": {
64
- "alias": " - prehistory",
65
- "acc,none": 0.9104938271604939,
66
- "acc_stderr,none": 0.015884141073937555
67
- },
68
- "mmlu_professional_law": {
69
- "alias": " - professional_law",
70
- "acc,none": 0.6929595827900913,
71
- "acc_stderr,none": 0.011780959114513764
72
- },
73
- "mmlu_world_religions": {
74
- "alias": " - world_religions",
75
- "acc,none": 0.8888888888888888,
76
- "acc_stderr,none": 0.024103384202072864
77
- },
78
- "mmlu_other": {
79
- "alias": " - other",
80
- "acc,none": 0.8625683939491471,
81
- "acc_stderr,none": 0.005895325056685939
82
- },
83
- "mmlu_business_ethics": {
84
- "alias": " - business_ethics",
85
- "acc,none": 0.78,
86
- "acc_stderr,none": 0.04163331998932263
87
- },
88
- "mmlu_clinical_knowledge": {
89
- "alias": " - clinical_knowledge",
90
- "acc,none": 0.8716981132075472,
91
- "acc_stderr,none": 0.02058247568799185
92
- },
93
- "mmlu_college_medicine": {
94
- "alias": " - college_medicine",
95
- "acc,none": 0.8323699421965318,
96
- "acc_stderr,none": 0.028481963032143395
97
- },
98
- "mmlu_global_facts": {
99
- "alias": " - global_facts",
100
- "acc,none": 0.61,
101
- "acc_stderr,none": 0.04902071300001975
102
- },
103
- "mmlu_human_aging": {
104
- "alias": " - human_aging",
105
- "acc,none": 0.8565022421524664,
106
- "acc_stderr,none": 0.0235293712696182
107
- },
108
- "mmlu_management": {
109
- "alias": " - management",
110
- "acc,none": 0.9223300970873787,
111
- "acc_stderr,none": 0.026501440784762766
112
- },
113
- "mmlu_marketing": {
114
- "alias": " - marketing",
115
- "acc,none": 0.9487179487179487,
116
- "acc_stderr,none": 0.014450181176872726
117
- },
118
- "mmlu_medical_genetics": {
119
- "alias": " - medical_genetics",
120
- "acc,none": 0.9,
121
- "acc_stderr,none": 0.030151134457776348
122
- },
123
- "mmlu_miscellaneous": {
124
- "alias": " - miscellaneous",
125
- "acc,none": 0.9501915708812261,
126
- "acc_stderr,none": 0.0077795348866793465
127
- },
128
- "mmlu_nutrition": {
129
- "alias": " - nutrition",
130
- "acc,none": 0.9019607843137255,
131
- "acc_stderr,none": 0.017027222935582193
132
- },
133
- "mmlu_professional_accounting": {
134
- "alias": " - professional_accounting",
135
- "acc,none": 0.75177304964539,
136
- "acc_stderr,none": 0.025770015644290392
137
- },
138
- "mmlu_professional_medicine": {
139
- "alias": " - professional_medicine",
140
- "acc,none": 0.8897058823529411,
141
- "acc_stderr,none": 0.019028947191474497
142
- },
143
- "mmlu_virology": {
144
- "alias": " - virology",
145
- "acc,none": 0.5662650602409639,
146
- "acc_stderr,none": 0.03858158940685517
147
- },
148
- "mmlu_social_sciences": {
149
- "alias": " - social_sciences",
150
- "acc,none": 0.9038024049398765,
151
- "acc_stderr,none": 0.005221504585802578
152
- },
153
- "mmlu_econometrics": {
154
- "alias": " - econometrics",
155
- "acc,none": 0.7280701754385965,
156
- "acc_stderr,none": 0.041857744240220554
157
- },
158
- "mmlu_high_school_geography": {
159
- "alias": " - high_school_geography",
160
- "acc,none": 0.9393939393939394,
161
- "acc_stderr,none": 0.016999994927421606
162
- },
163
- "mmlu_high_school_government_and_politics": {
164
- "alias": " - high_school_government_and_politics",
165
- "acc,none": 0.9896373056994818,
166
- "acc_stderr,none": 0.007308424386792201
167
- },
168
- "mmlu_high_school_macroeconomics": {
169
- "alias": " - high_school_macroeconomics",
170
- "acc,none": 0.8897435897435897,
171
- "acc_stderr,none": 0.015880331261056115
172
- },
173
- "mmlu_high_school_microeconomics": {
174
- "alias": " - high_school_microeconomics",
175
- "acc,none": 0.9411764705882353,
176
- "acc_stderr,none": 0.015283995352038402
177
- },
178
- "mmlu_high_school_psychology": {
179
- "alias": " - high_school_psychology",
180
- "acc,none": 0.9357798165137615,
181
- "acc_stderr,none": 0.010510494713201424
182
- },
183
- "mmlu_human_sexuality": {
184
- "alias": " - human_sexuality",
185
- "acc,none": 0.9083969465648855,
186
- "acc_stderr,none": 0.025300035578642965
187
- },
188
- "mmlu_professional_psychology": {
189
- "alias": " - professional_psychology",
190
- "acc,none": 0.8970588235294118,
191
- "acc_stderr,none": 0.012293751200845176
192
- },
193
- "mmlu_public_relations": {
194
- "alias": " - public_relations",
195
- "acc,none": 0.7454545454545455,
196
- "acc_stderr,none": 0.041723430387053825
197
- },
198
- "mmlu_security_studies": {
199
- "alias": " - security_studies",
200
- "acc,none": 0.8408163265306122,
201
- "acc_stderr,none": 0.023420972069166365
202
- },
203
- "mmlu_sociology": {
204
- "alias": " - sociology",
205
- "acc,none": 0.945273631840796,
206
- "acc_stderr,none": 0.016082815796263254
207
- },
208
- "mmlu_us_foreign_policy": {
209
- "alias": " - us_foreign_policy",
210
- "acc,none": 0.94,
211
- "acc_stderr,none": 0.02386832565759419
212
- },
213
- "mmlu_stem": {
214
- "alias": " - stem",
215
- "acc,none": 0.8109736758642563,
216
- "acc_stderr,none": 0.0067376135296805745
217
- },
218
- "mmlu_abstract_algebra": {
219
- "alias": " - abstract_algebra",
220
- "acc,none": 0.66,
221
- "acc_stderr,none": 0.04760952285695237
222
- },
223
- "mmlu_anatomy": {
224
- "alias": " - anatomy",
225
- "acc,none": 0.7925925925925926,
226
- "acc_stderr,none": 0.03502553170678317
227
- },
228
- "mmlu_astronomy": {
229
- "alias": " - astronomy",
230
- "acc,none": 0.9276315789473685,
231
- "acc_stderr,none": 0.021085011261884112
232
- },
233
- "mmlu_college_biology": {
234
- "alias": " - college_biology",
235
- "acc,none": 0.9444444444444444,
236
- "acc_stderr,none": 0.01915507853243362
237
- },
238
- "mmlu_college_chemistry": {
239
- "alias": " - college_chemistry",
240
- "acc,none": 0.58,
241
- "acc_stderr,none": 0.049604496374885836
242
- },
243
- "mmlu_college_computer_science": {
244
- "alias": " - college_computer_science",
245
- "acc,none": 0.8,
246
- "acc_stderr,none": 0.040201512610368445
247
- },
248
- "mmlu_college_mathematics": {
249
- "alias": " - college_mathematics",
250
- "acc,none": 0.63,
251
- "acc_stderr,none": 0.04852365870939099
252
- },
253
- "mmlu_college_physics": {
254
- "alias": " - college_physics",
255
- "acc,none": 0.6470588235294118,
256
- "acc_stderr,none": 0.04755129616062947
257
- },
258
- "mmlu_computer_security": {
259
- "alias": " - computer_security",
260
- "acc,none": 0.83,
261
- "acc_stderr,none": 0.0377525168068637
262
- },
263
- "mmlu_conceptual_physics": {
264
- "alias": " - conceptual_physics",
265
- "acc,none": 0.8893617021276595,
266
- "acc_stderr,none": 0.020506145099008433
267
- },
268
- "mmlu_electrical_engineering": {
269
- "alias": " - electrical_engineering",
270
- "acc,none": 0.8275862068965517,
271
- "acc_stderr,none": 0.03147830790259575
272
- },
273
- "mmlu_elementary_mathematics": {
274
- "alias": " - elementary_mathematics",
275
- "acc,none": 0.8888888888888888,
276
- "acc_stderr,none": 0.01618571201620511
277
- },
278
- "mmlu_high_school_biology": {
279
- "alias": " - high_school_biology",
280
- "acc,none": 0.9419354838709677,
281
- "acc_stderr,none": 0.01330413811280927
282
- },
283
- "mmlu_high_school_chemistry": {
284
- "alias": " - high_school_chemistry",
285
- "acc,none": 0.7980295566502463,
286
- "acc_stderr,none": 0.028247350122180243
287
- },
288
- "mmlu_high_school_computer_science": {
289
- "alias": " - high_school_computer_science",
290
- "acc,none": 0.91,
291
- "acc_stderr,none": 0.028762349126466115
292
- },
293
- "mmlu_high_school_mathematics": {
294
- "alias": " - high_school_mathematics",
295
- "acc,none": 0.6777777777777778,
296
- "acc_stderr,none": 0.028493465091028597
297
- },
298
- "mmlu_high_school_physics": {
299
- "alias": " - high_school_physics",
300
- "acc,none": 0.7284768211920529,
301
- "acc_stderr,none": 0.03631329803969654
302
- },
303
- "mmlu_high_school_statistics": {
304
- "alias": " - high_school_statistics",
305
- "acc,none": 0.7824074074074074,
306
- "acc_stderr,none": 0.028139689444859676
307
- },
308
- "mmlu_machine_learning": {
309
- "alias": " - machine_learning",
310
- "acc,none": 0.7589285714285714,
311
- "acc_stderr,none": 0.04059867246952685
312
- }
313
- },
314
- "groups": {
315
- "mmlu": {
316
- "acc,none": 0.8439680957128615,
317
- "acc_stderr,none": 0.0029499711040394372,
318
- "alias": "mmlu"
319
- },
320
- "mmlu_humanities": {
321
- "alias": " - humanities",
322
- "acc,none": 0.8146652497343252,
323
- "acc_stderr,none": 0.005505402478774841
324
- },
325
- "mmlu_other": {
326
- "alias": " - other",
327
- "acc,none": 0.8625683939491471,
328
- "acc_stderr,none": 0.005895325056685939
329
- },
330
- "mmlu_social_sciences": {
331
- "alias": " - social_sciences",
332
- "acc,none": 0.9038024049398765,
333
- "acc_stderr,none": 0.005221504585802578
334
- },
335
- "mmlu_stem": {
336
- "alias": " - stem",
337
- "acc,none": 0.8109736758642563,
338
- "acc_stderr,none": 0.0067376135296805745
339
- }
340
- },
341
- "group_subtasks": {
342
- "mmlu_stem": [
343
- "mmlu_college_biology",
344
- "mmlu_high_school_computer_science",
345
- "mmlu_elementary_mathematics",
346
- "mmlu_astronomy",
347
- "mmlu_machine_learning",
348
- "mmlu_high_school_mathematics",
349
- "mmlu_electrical_engineering",
350
- "mmlu_college_chemistry",
351
- "mmlu_college_mathematics",
352
- "mmlu_high_school_statistics",
353
- "mmlu_high_school_biology",
354
- "mmlu_abstract_algebra",
355
- "mmlu_college_physics",
356
- "mmlu_conceptual_physics",
357
- "mmlu_computer_security",
358
- "mmlu_anatomy",
359
- "mmlu_college_computer_science",
360
- "mmlu_high_school_physics",
361
- "mmlu_high_school_chemistry"
362
- ],
363
- "mmlu_other": [
364
- "mmlu_marketing",
365
- "mmlu_professional_accounting",
366
- "mmlu_clinical_knowledge",
367
- "mmlu_college_medicine",
368
- "mmlu_miscellaneous",
369
- "mmlu_virology",
370
- "mmlu_business_ethics",
371
- "mmlu_professional_medicine",
372
- "mmlu_global_facts",
373
- "mmlu_nutrition",
374
- "mmlu_human_aging",
375
- "mmlu_management",
376
- "mmlu_medical_genetics"
377
- ],
378
- "mmlu_social_sciences": [
379
- "mmlu_high_school_psychology",
380
- "mmlu_high_school_geography",
381
- "mmlu_high_school_macroeconomics",
382
- "mmlu_public_relations",
383
- "mmlu_security_studies",
384
- "mmlu_high_school_microeconomics",
385
- "mmlu_human_sexuality",
386
- "mmlu_sociology",
387
- "mmlu_professional_psychology",
388
- "mmlu_econometrics",
389
- "mmlu_us_foreign_policy",
390
- "mmlu_high_school_government_and_politics"
391
- ],
392
- "mmlu_humanities": [
393
- "mmlu_moral_scenarios",
394
- "mmlu_high_school_us_history",
395
- "mmlu_high_school_world_history",
396
- "mmlu_world_religions",
397
- "mmlu_formal_logic",
398
- "mmlu_moral_disputes",
399
- "mmlu_prehistory",
400
- "mmlu_international_law",
401
- "mmlu_logical_fallacies",
402
- "mmlu_professional_law",
403
- "mmlu_philosophy",
404
- "mmlu_high_school_european_history",
405
- "mmlu_jurisprudence"
406
- ],
407
- "mmlu": [
408
- "mmlu_humanities",
409
- "mmlu_social_sciences",
410
- "mmlu_other",
411
- "mmlu_stem"
412
- ]
413
- },
414
- "configs": {
415
- "mmlu_abstract_algebra": {
416
- "task": "mmlu_abstract_algebra",
417
- "task_alias": "abstract_algebra",
418
- "group": "mmlu_stem",
419
- "group_alias": "stem",
420
- "dataset_path": "hails/mmlu_no_train",
421
- "dataset_name": "abstract_algebra",
422
- "test_split": "test",
423
- "fewshot_split": "dev",
424
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
425
- "doc_to_target": "answer",
426
- "doc_to_choice": [
427
- "A",
428
- "B",
429
- "C",
430
- "D"
431
- ],
432
- "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
433
- "target_delimiter": " ",
434
- "fewshot_delimiter": "\n\n",
435
- "fewshot_config": {
436
- "sampler": "first_n"
437
- },
438
- "num_fewshot": 5,
439
- "metric_list": [
440
- {
441
- "metric": "acc",
442
- "aggregation": "mean",
443
- "higher_is_better": true
444
- }
445
- ],
446
- "output_type": "multiple_choice",
447
- "repeats": 1,
448
- "should_decontaminate": false,
449
- "metadata": {
450
- "version": 0.0
451
- }
452
- },
453
- "mmlu_anatomy": {
454
- "task": "mmlu_anatomy",
455
- "task_alias": "anatomy",
456
- "group": "mmlu_stem",
457
- "group_alias": "stem",
458
- "dataset_path": "hails/mmlu_no_train",
459
- "dataset_name": "anatomy",
460
- "test_split": "test",
461
- "fewshot_split": "dev",
462
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
463
- "doc_to_target": "answer",
464
- "doc_to_choice": [
465
- "A",
466
- "B",
467
- "C",
468
- "D"
469
- ],
470
- "description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
471
- "target_delimiter": " ",
472
- "fewshot_delimiter": "\n\n",
473
- "fewshot_config": {
474
- "sampler": "first_n"
475
- },
476
- "num_fewshot": 5,
477
- "metric_list": [
478
- {
479
- "metric": "acc",
480
- "aggregation": "mean",
481
- "higher_is_better": true
482
- }
483
- ],
484
- "output_type": "multiple_choice",
485
- "repeats": 1,
486
- "should_decontaminate": false,
487
- "metadata": {
488
- "version": 0.0
489
- }
490
- },
491
- "mmlu_astronomy": {
492
- "task": "mmlu_astronomy",
493
- "task_alias": "astronomy",
494
- "group": "mmlu_stem",
495
- "group_alias": "stem",
496
- "dataset_path": "hails/mmlu_no_train",
497
- "dataset_name": "astronomy",
498
- "test_split": "test",
499
- "fewshot_split": "dev",
500
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
501
- "doc_to_target": "answer",
502
- "doc_to_choice": [
503
- "A",
504
- "B",
505
- "C",
506
- "D"
507
- ],
508
- "description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
509
- "target_delimiter": " ",
510
- "fewshot_delimiter": "\n\n",
511
- "fewshot_config": {
512
- "sampler": "first_n"
513
- },
514
- "num_fewshot": 5,
515
- "metric_list": [
516
- {
517
- "metric": "acc",
518
- "aggregation": "mean",
519
- "higher_is_better": true
520
- }
521
- ],
522
- "output_type": "multiple_choice",
523
- "repeats": 1,
524
- "should_decontaminate": false,
525
- "metadata": {
526
- "version": 0.0
527
- }
528
- },
529
- "mmlu_business_ethics": {
530
- "task": "mmlu_business_ethics",
531
- "task_alias": "business_ethics",
532
- "group": "mmlu_other",
533
- "group_alias": "other",
534
- "dataset_path": "hails/mmlu_no_train",
535
- "dataset_name": "business_ethics",
536
- "test_split": "test",
537
- "fewshot_split": "dev",
538
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
539
- "doc_to_target": "answer",
540
- "doc_to_choice": [
541
- "A",
542
- "B",
543
- "C",
544
- "D"
545
- ],
546
- "description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
547
- "target_delimiter": " ",
548
- "fewshot_delimiter": "\n\n",
549
- "fewshot_config": {
550
- "sampler": "first_n"
551
- },
552
- "num_fewshot": 5,
553
- "metric_list": [
554
- {
555
- "metric": "acc",
556
- "aggregation": "mean",
557
- "higher_is_better": true
558
- }
559
- ],
560
- "output_type": "multiple_choice",
561
- "repeats": 1,
562
- "should_decontaminate": false,
563
- "metadata": {
564
- "version": 0.0
565
- }
566
- },
567
- "mmlu_clinical_knowledge": {
568
- "task": "mmlu_clinical_knowledge",
569
- "task_alias": "clinical_knowledge",
570
- "group": "mmlu_other",
571
- "group_alias": "other",
572
- "dataset_path": "hails/mmlu_no_train",
573
- "dataset_name": "clinical_knowledge",
574
- "test_split": "test",
575
- "fewshot_split": "dev",
576
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
577
- "doc_to_target": "answer",
578
- "doc_to_choice": [
579
- "A",
580
- "B",
581
- "C",
582
- "D"
583
- ],
584
- "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
585
- "target_delimiter": " ",
586
- "fewshot_delimiter": "\n\n",
587
- "fewshot_config": {
588
- "sampler": "first_n"
589
- },
590
- "num_fewshot": 5,
591
- "metric_list": [
592
- {
593
- "metric": "acc",
594
- "aggregation": "mean",
595
- "higher_is_better": true
596
- }
597
- ],
598
- "output_type": "multiple_choice",
599
- "repeats": 1,
600
- "should_decontaminate": false,
601
- "metadata": {
602
- "version": 0.0
603
- }
604
- },
605
- "mmlu_college_biology": {
606
- "task": "mmlu_college_biology",
607
- "task_alias": "college_biology",
608
- "group": "mmlu_stem",
609
- "group_alias": "stem",
610
- "dataset_path": "hails/mmlu_no_train",
611
- "dataset_name": "college_biology",
612
- "test_split": "test",
613
- "fewshot_split": "dev",
614
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
615
- "doc_to_target": "answer",
616
- "doc_to_choice": [
617
- "A",
618
- "B",
619
- "C",
620
- "D"
621
- ],
622
- "description": "The following are multiple choice questions (with answers) about college biology.\n\n",
623
- "target_delimiter": " ",
624
- "fewshot_delimiter": "\n\n",
625
- "fewshot_config": {
626
- "sampler": "first_n"
627
- },
628
- "num_fewshot": 5,
629
- "metric_list": [
630
- {
631
- "metric": "acc",
632
- "aggregation": "mean",
633
- "higher_is_better": true
634
- }
635
- ],
636
- "output_type": "multiple_choice",
637
- "repeats": 1,
638
- "should_decontaminate": false,
639
- "metadata": {
640
- "version": 0.0
641
- }
642
- },
643
- "mmlu_college_chemistry": {
644
- "task": "mmlu_college_chemistry",
645
- "task_alias": "college_chemistry",
646
- "group": "mmlu_stem",
647
- "group_alias": "stem",
648
- "dataset_path": "hails/mmlu_no_train",
649
- "dataset_name": "college_chemistry",
650
- "test_split": "test",
651
- "fewshot_split": "dev",
652
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
653
- "doc_to_target": "answer",
654
- "doc_to_choice": [
655
- "A",
656
- "B",
657
- "C",
658
- "D"
659
- ],
660
- "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
661
- "target_delimiter": " ",
662
- "fewshot_delimiter": "\n\n",
663
- "fewshot_config": {
664
- "sampler": "first_n"
665
- },
666
- "num_fewshot": 5,
667
- "metric_list": [
668
- {
669
- "metric": "acc",
670
- "aggregation": "mean",
671
- "higher_is_better": true
672
- }
673
- ],
674
- "output_type": "multiple_choice",
675
- "repeats": 1,
676
- "should_decontaminate": false,
677
- "metadata": {
678
- "version": 0.0
679
- }
680
- },
681
- "mmlu_college_computer_science": {
682
- "task": "mmlu_college_computer_science",
683
- "task_alias": "college_computer_science",
684
- "group": "mmlu_stem",
685
- "group_alias": "stem",
686
- "dataset_path": "hails/mmlu_no_train",
687
- "dataset_name": "college_computer_science",
688
- "test_split": "test",
689
- "fewshot_split": "dev",
690
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
691
- "doc_to_target": "answer",
692
- "doc_to_choice": [
693
- "A",
694
- "B",
695
- "C",
696
- "D"
697
- ],
698
- "description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
699
- "target_delimiter": " ",
700
- "fewshot_delimiter": "\n\n",
701
- "fewshot_config": {
702
- "sampler": "first_n"
703
- },
704
- "num_fewshot": 5,
705
- "metric_list": [
706
- {
707
- "metric": "acc",
708
- "aggregation": "mean",
709
- "higher_is_better": true
710
- }
711
- ],
712
- "output_type": "multiple_choice",
713
- "repeats": 1,
714
- "should_decontaminate": false,
715
- "metadata": {
716
- "version": 0.0
717
- }
718
- },
719
- "mmlu_college_mathematics": {
720
- "task": "mmlu_college_mathematics",
721
- "task_alias": "college_mathematics",
722
- "group": "mmlu_stem",
723
- "group_alias": "stem",
724
- "dataset_path": "hails/mmlu_no_train",
725
- "dataset_name": "college_mathematics",
726
- "test_split": "test",
727
- "fewshot_split": "dev",
728
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
729
- "doc_to_target": "answer",
730
- "doc_to_choice": [
731
- "A",
732
- "B",
733
- "C",
734
- "D"
735
- ],
736
- "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
737
- "target_delimiter": " ",
738
- "fewshot_delimiter": "\n\n",
739
- "fewshot_config": {
740
- "sampler": "first_n"
741
- },
742
- "num_fewshot": 5,
743
- "metric_list": [
744
- {
745
- "metric": "acc",
746
- "aggregation": "mean",
747
- "higher_is_better": true
748
- }
749
- ],
750
- "output_type": "multiple_choice",
751
- "repeats": 1,
752
- "should_decontaminate": false,
753
- "metadata": {
754
- "version": 0.0
755
- }
756
- },
757
- "mmlu_college_medicine": {
758
- "task": "mmlu_college_medicine",
759
- "task_alias": "college_medicine",
760
- "group": "mmlu_other",
761
- "group_alias": "other",
762
- "dataset_path": "hails/mmlu_no_train",
763
- "dataset_name": "college_medicine",
764
- "test_split": "test",
765
- "fewshot_split": "dev",
766
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
767
- "doc_to_target": "answer",
768
- "doc_to_choice": [
769
- "A",
770
- "B",
771
- "C",
772
- "D"
773
- ],
774
- "description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
775
- "target_delimiter": " ",
776
- "fewshot_delimiter": "\n\n",
777
- "fewshot_config": {
778
- "sampler": "first_n"
779
- },
780
- "num_fewshot": 5,
781
- "metric_list": [
782
- {
783
- "metric": "acc",
784
- "aggregation": "mean",
785
- "higher_is_better": true
786
- }
787
- ],
788
- "output_type": "multiple_choice",
789
- "repeats": 1,
790
- "should_decontaminate": false,
791
- "metadata": {
792
- "version": 0.0
793
- }
794
- },
795
- "mmlu_college_physics": {
796
- "task": "mmlu_college_physics",
797
- "task_alias": "college_physics",
798
- "group": "mmlu_stem",
799
- "group_alias": "stem",
800
- "dataset_path": "hails/mmlu_no_train",
801
- "dataset_name": "college_physics",
802
- "test_split": "test",
803
- "fewshot_split": "dev",
804
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
805
- "doc_to_target": "answer",
806
- "doc_to_choice": [
807
- "A",
808
- "B",
809
- "C",
810
- "D"
811
- ],
812
- "description": "The following are multiple choice questions (with answers) about college physics.\n\n",
813
- "target_delimiter": " ",
814
- "fewshot_delimiter": "\n\n",
815
- "fewshot_config": {
816
- "sampler": "first_n"
817
- },
818
- "num_fewshot": 5,
819
- "metric_list": [
820
- {
821
- "metric": "acc",
822
- "aggregation": "mean",
823
- "higher_is_better": true
824
- }
825
- ],
826
- "output_type": "multiple_choice",
827
- "repeats": 1,
828
- "should_decontaminate": false,
829
- "metadata": {
830
- "version": 0.0
831
- }
832
- },
833
- "mmlu_computer_security": {
834
- "task": "mmlu_computer_security",
835
- "task_alias": "computer_security",
836
- "group": "mmlu_stem",
837
- "group_alias": "stem",
838
- "dataset_path": "hails/mmlu_no_train",
839
- "dataset_name": "computer_security",
840
- "test_split": "test",
841
- "fewshot_split": "dev",
842
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
843
- "doc_to_target": "answer",
844
- "doc_to_choice": [
845
- "A",
846
- "B",
847
- "C",
848
- "D"
849
- ],
850
- "description": "The following are multiple choice questions (with answers) about computer security.\n\n",
851
- "target_delimiter": " ",
852
- "fewshot_delimiter": "\n\n",
853
- "fewshot_config": {
854
- "sampler": "first_n"
855
- },
856
- "num_fewshot": 5,
857
- "metric_list": [
858
- {
859
- "metric": "acc",
860
- "aggregation": "mean",
861
- "higher_is_better": true
862
- }
863
- ],
864
- "output_type": "multiple_choice",
865
- "repeats": 1,
866
- "should_decontaminate": false,
867
- "metadata": {
868
- "version": 0.0
869
- }
870
- },
871
- "mmlu_conceptual_physics": {
872
- "task": "mmlu_conceptual_physics",
873
- "task_alias": "conceptual_physics",
874
- "group": "mmlu_stem",
875
- "group_alias": "stem",
876
- "dataset_path": "hails/mmlu_no_train",
877
- "dataset_name": "conceptual_physics",
878
- "test_split": "test",
879
- "fewshot_split": "dev",
880
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
881
- "doc_to_target": "answer",
882
- "doc_to_choice": [
883
- "A",
884
- "B",
885
- "C",
886
- "D"
887
- ],
888
- "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
889
- "target_delimiter": " ",
890
- "fewshot_delimiter": "\n\n",
891
- "fewshot_config": {
892
- "sampler": "first_n"
893
- },
894
- "num_fewshot": 5,
895
- "metric_list": [
896
- {
897
- "metric": "acc",
898
- "aggregation": "mean",
899
- "higher_is_better": true
900
- }
901
- ],
902
- "output_type": "multiple_choice",
903
- "repeats": 1,
904
- "should_decontaminate": false,
905
- "metadata": {
906
- "version": 0.0
907
- }
908
- },
909
- "mmlu_econometrics": {
910
- "task": "mmlu_econometrics",
911
- "task_alias": "econometrics",
912
- "group": "mmlu_social_sciences",
913
- "group_alias": "social_sciences",
914
- "dataset_path": "hails/mmlu_no_train",
915
- "dataset_name": "econometrics",
916
- "test_split": "test",
917
- "fewshot_split": "dev",
918
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
919
- "doc_to_target": "answer",
920
- "doc_to_choice": [
921
- "A",
922
- "B",
923
- "C",
924
- "D"
925
- ],
926
- "description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
927
- "target_delimiter": " ",
928
- "fewshot_delimiter": "\n\n",
929
- "fewshot_config": {
930
- "sampler": "first_n"
931
- },
932
- "num_fewshot": 5,
933
- "metric_list": [
934
- {
935
- "metric": "acc",
936
- "aggregation": "mean",
937
- "higher_is_better": true
938
- }
939
- ],
940
- "output_type": "multiple_choice",
941
- "repeats": 1,
942
- "should_decontaminate": false,
943
- "metadata": {
944
- "version": 0.0
945
- }
946
- },
947
- "mmlu_electrical_engineering": {
948
- "task": "mmlu_electrical_engineering",
949
- "task_alias": "electrical_engineering",
950
- "group": "mmlu_stem",
951
- "group_alias": "stem",
952
- "dataset_path": "hails/mmlu_no_train",
953
- "dataset_name": "electrical_engineering",
954
- "test_split": "test",
955
- "fewshot_split": "dev",
956
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
957
- "doc_to_target": "answer",
958
- "doc_to_choice": [
959
- "A",
960
- "B",
961
- "C",
962
- "D"
963
- ],
964
- "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
965
- "target_delimiter": " ",
966
- "fewshot_delimiter": "\n\n",
967
- "fewshot_config": {
968
- "sampler": "first_n"
969
- },
970
- "num_fewshot": 5,
971
- "metric_list": [
972
- {
973
- "metric": "acc",
974
- "aggregation": "mean",
975
- "higher_is_better": true
976
- }
977
- ],
978
- "output_type": "multiple_choice",
979
- "repeats": 1,
980
- "should_decontaminate": false,
981
- "metadata": {
982
- "version": 0.0
983
- }
984
- },
985
- "mmlu_elementary_mathematics": {
986
- "task": "mmlu_elementary_mathematics",
987
- "task_alias": "elementary_mathematics",
988
- "group": "mmlu_stem",
989
- "group_alias": "stem",
990
- "dataset_path": "hails/mmlu_no_train",
991
- "dataset_name": "elementary_mathematics",
992
- "test_split": "test",
993
- "fewshot_split": "dev",
994
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
995
- "doc_to_target": "answer",
996
- "doc_to_choice": [
997
- "A",
998
- "B",
999
- "C",
1000
- "D"
1001
- ],
1002
- "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
1003
- "target_delimiter": " ",
1004
- "fewshot_delimiter": "\n\n",
1005
- "fewshot_config": {
1006
- "sampler": "first_n"
1007
- },
1008
- "num_fewshot": 5,
1009
- "metric_list": [
1010
- {
1011
- "metric": "acc",
1012
- "aggregation": "mean",
1013
- "higher_is_better": true
1014
- }
1015
- ],
1016
- "output_type": "multiple_choice",
1017
- "repeats": 1,
1018
- "should_decontaminate": false,
1019
- "metadata": {
1020
- "version": 0.0
1021
- }
1022
- },
1023
- "mmlu_formal_logic": {
1024
- "task": "mmlu_formal_logic",
1025
- "task_alias": "formal_logic",
1026
- "group": "mmlu_humanities",
1027
- "group_alias": "humanities",
1028
- "dataset_path": "hails/mmlu_no_train",
1029
- "dataset_name": "formal_logic",
1030
- "test_split": "test",
1031
- "fewshot_split": "dev",
1032
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1033
- "doc_to_target": "answer",
1034
- "doc_to_choice": [
1035
- "A",
1036
- "B",
1037
- "C",
1038
- "D"
1039
- ],
1040
- "description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
1041
- "target_delimiter": " ",
1042
- "fewshot_delimiter": "\n\n",
1043
- "fewshot_config": {
1044
- "sampler": "first_n"
1045
- },
1046
- "num_fewshot": 5,
1047
- "metric_list": [
1048
- {
1049
- "metric": "acc",
1050
- "aggregation": "mean",
1051
- "higher_is_better": true
1052
- }
1053
- ],
1054
- "output_type": "multiple_choice",
1055
- "repeats": 1,
1056
- "should_decontaminate": false,
1057
- "metadata": {
1058
- "version": 0.0
1059
- }
1060
- },
1061
- "mmlu_global_facts": {
1062
- "task": "mmlu_global_facts",
1063
- "task_alias": "global_facts",
1064
- "group": "mmlu_other",
1065
- "group_alias": "other",
1066
- "dataset_path": "hails/mmlu_no_train",
1067
- "dataset_name": "global_facts",
1068
- "test_split": "test",
1069
- "fewshot_split": "dev",
1070
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1071
- "doc_to_target": "answer",
1072
- "doc_to_choice": [
1073
- "A",
1074
- "B",
1075
- "C",
1076
- "D"
1077
- ],
1078
- "description": "The following are multiple choice questions (with answers) about global facts.\n\n",
1079
- "target_delimiter": " ",
1080
- "fewshot_delimiter": "\n\n",
1081
- "fewshot_config": {
1082
- "sampler": "first_n"
1083
- },
1084
- "num_fewshot": 5,
1085
- "metric_list": [
1086
- {
1087
- "metric": "acc",
1088
- "aggregation": "mean",
1089
- "higher_is_better": true
1090
- }
1091
- ],
1092
- "output_type": "multiple_choice",
1093
- "repeats": 1,
1094
- "should_decontaminate": false,
1095
- "metadata": {
1096
- "version": 0.0
1097
- }
1098
- },
1099
- "mmlu_high_school_biology": {
1100
- "task": "mmlu_high_school_biology",
1101
- "task_alias": "high_school_biology",
1102
- "group": "mmlu_stem",
1103
- "group_alias": "stem",
1104
- "dataset_path": "hails/mmlu_no_train",
1105
- "dataset_name": "high_school_biology",
1106
- "test_split": "test",
1107
- "fewshot_split": "dev",
1108
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1109
- "doc_to_target": "answer",
1110
- "doc_to_choice": [
1111
- "A",
1112
- "B",
1113
- "C",
1114
- "D"
1115
- ],
1116
- "description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
1117
- "target_delimiter": " ",
1118
- "fewshot_delimiter": "\n\n",
1119
- "fewshot_config": {
1120
- "sampler": "first_n"
1121
- },
1122
- "num_fewshot": 5,
1123
- "metric_list": [
1124
- {
1125
- "metric": "acc",
1126
- "aggregation": "mean",
1127
- "higher_is_better": true
1128
- }
1129
- ],
1130
- "output_type": "multiple_choice",
1131
- "repeats": 1,
1132
- "should_decontaminate": false,
1133
- "metadata": {
1134
- "version": 0.0
1135
- }
1136
- },
1137
- "mmlu_high_school_chemistry": {
1138
- "task": "mmlu_high_school_chemistry",
1139
- "task_alias": "high_school_chemistry",
1140
- "group": "mmlu_stem",
1141
- "group_alias": "stem",
1142
- "dataset_path": "hails/mmlu_no_train",
1143
- "dataset_name": "high_school_chemistry",
1144
- "test_split": "test",
1145
- "fewshot_split": "dev",
1146
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1147
- "doc_to_target": "answer",
1148
- "doc_to_choice": [
1149
- "A",
1150
- "B",
1151
- "C",
1152
- "D"
1153
- ],
1154
- "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
1155
- "target_delimiter": " ",
1156
- "fewshot_delimiter": "\n\n",
1157
- "fewshot_config": {
1158
- "sampler": "first_n"
1159
- },
1160
- "num_fewshot": 5,
1161
- "metric_list": [
1162
- {
1163
- "metric": "acc",
1164
- "aggregation": "mean",
1165
- "higher_is_better": true
1166
- }
1167
- ],
1168
- "output_type": "multiple_choice",
1169
- "repeats": 1,
1170
- "should_decontaminate": false,
1171
- "metadata": {
1172
- "version": 0.0
1173
- }
1174
- },
1175
- "mmlu_high_school_computer_science": {
1176
- "task": "mmlu_high_school_computer_science",
1177
- "task_alias": "high_school_computer_science",
1178
- "group": "mmlu_stem",
1179
- "group_alias": "stem",
1180
- "dataset_path": "hails/mmlu_no_train",
1181
- "dataset_name": "high_school_computer_science",
1182
- "test_split": "test",
1183
- "fewshot_split": "dev",
1184
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1185
- "doc_to_target": "answer",
1186
- "doc_to_choice": [
1187
- "A",
1188
- "B",
1189
- "C",
1190
- "D"
1191
- ],
1192
- "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
1193
- "target_delimiter": " ",
1194
- "fewshot_delimiter": "\n\n",
1195
- "fewshot_config": {
1196
- "sampler": "first_n"
1197
- },
1198
- "num_fewshot": 5,
1199
- "metric_list": [
1200
- {
1201
- "metric": "acc",
1202
- "aggregation": "mean",
1203
- "higher_is_better": true
1204
- }
1205
- ],
1206
- "output_type": "multiple_choice",
1207
- "repeats": 1,
1208
- "should_decontaminate": false,
1209
- "metadata": {
1210
- "version": 0.0
1211
- }
1212
- },
1213
- "mmlu_high_school_european_history": {
1214
- "task": "mmlu_high_school_european_history",
1215
- "task_alias": "high_school_european_history",
1216
- "group": "mmlu_humanities",
1217
- "group_alias": "humanities",
1218
- "dataset_path": "hails/mmlu_no_train",
1219
- "dataset_name": "high_school_european_history",
1220
- "test_split": "test",
1221
- "fewshot_split": "dev",
1222
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1223
- "doc_to_target": "answer",
1224
- "doc_to_choice": [
1225
- "A",
1226
- "B",
1227
- "C",
1228
- "D"
1229
- ],
1230
- "description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
1231
- "target_delimiter": " ",
1232
- "fewshot_delimiter": "\n\n",
1233
- "fewshot_config": {
1234
- "sampler": "first_n"
1235
- },
1236
- "num_fewshot": 5,
1237
- "metric_list": [
1238
- {
1239
- "metric": "acc",
1240
- "aggregation": "mean",
1241
- "higher_is_better": true
1242
- }
1243
- ],
1244
- "output_type": "multiple_choice",
1245
- "repeats": 1,
1246
- "should_decontaminate": false,
1247
- "metadata": {
1248
- "version": 0.0
1249
- }
1250
- },
1251
- "mmlu_high_school_geography": {
1252
- "task": "mmlu_high_school_geography",
1253
- "task_alias": "high_school_geography",
1254
- "group": "mmlu_social_sciences",
1255
- "group_alias": "social_sciences",
1256
- "dataset_path": "hails/mmlu_no_train",
1257
- "dataset_name": "high_school_geography",
1258
- "test_split": "test",
1259
- "fewshot_split": "dev",
1260
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1261
- "doc_to_target": "answer",
1262
- "doc_to_choice": [
1263
- "A",
1264
- "B",
1265
- "C",
1266
- "D"
1267
- ],
1268
- "description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
1269
- "target_delimiter": " ",
1270
- "fewshot_delimiter": "\n\n",
1271
- "fewshot_config": {
1272
- "sampler": "first_n"
1273
- },
1274
- "num_fewshot": 5,
1275
- "metric_list": [
1276
- {
1277
- "metric": "acc",
1278
- "aggregation": "mean",
1279
- "higher_is_better": true
1280
- }
1281
- ],
1282
- "output_type": "multiple_choice",
1283
- "repeats": 1,
1284
- "should_decontaminate": false,
1285
- "metadata": {
1286
- "version": 0.0
1287
- }
1288
- },
1289
- "mmlu_high_school_government_and_politics": {
1290
- "task": "mmlu_high_school_government_and_politics",
1291
- "task_alias": "high_school_government_and_politics",
1292
- "group": "mmlu_social_sciences",
1293
- "group_alias": "social_sciences",
1294
- "dataset_path": "hails/mmlu_no_train",
1295
- "dataset_name": "high_school_government_and_politics",
1296
- "test_split": "test",
1297
- "fewshot_split": "dev",
1298
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1299
- "doc_to_target": "answer",
1300
- "doc_to_choice": [
1301
- "A",
1302
- "B",
1303
- "C",
1304
- "D"
1305
- ],
1306
- "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
1307
- "target_delimiter": " ",
1308
- "fewshot_delimiter": "\n\n",
1309
- "fewshot_config": {
1310
- "sampler": "first_n"
1311
- },
1312
- "num_fewshot": 5,
1313
- "metric_list": [
1314
- {
1315
- "metric": "acc",
1316
- "aggregation": "mean",
1317
- "higher_is_better": true
1318
- }
1319
- ],
1320
- "output_type": "multiple_choice",
1321
- "repeats": 1,
1322
- "should_decontaminate": false,
1323
- "metadata": {
1324
- "version": 0.0
1325
- }
1326
- },
1327
- "mmlu_high_school_macroeconomics": {
1328
- "task": "mmlu_high_school_macroeconomics",
1329
- "task_alias": "high_school_macroeconomics",
1330
- "group": "mmlu_social_sciences",
1331
- "group_alias": "social_sciences",
1332
- "dataset_path": "hails/mmlu_no_train",
1333
- "dataset_name": "high_school_macroeconomics",
1334
- "test_split": "test",
1335
- "fewshot_split": "dev",
1336
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1337
- "doc_to_target": "answer",
1338
- "doc_to_choice": [
1339
- "A",
1340
- "B",
1341
- "C",
1342
- "D"
1343
- ],
1344
- "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
1345
- "target_delimiter": " ",
1346
- "fewshot_delimiter": "\n\n",
1347
- "fewshot_config": {
1348
- "sampler": "first_n"
1349
- },
1350
- "num_fewshot": 5,
1351
- "metric_list": [
1352
- {
1353
- "metric": "acc",
1354
- "aggregation": "mean",
1355
- "higher_is_better": true
1356
- }
1357
- ],
1358
- "output_type": "multiple_choice",
1359
- "repeats": 1,
1360
- "should_decontaminate": false,
1361
- "metadata": {
1362
- "version": 0.0
1363
- }
1364
- },
1365
- "mmlu_high_school_mathematics": {
1366
- "task": "mmlu_high_school_mathematics",
1367
- "task_alias": "high_school_mathematics",
1368
- "group": "mmlu_stem",
1369
- "group_alias": "stem",
1370
- "dataset_path": "hails/mmlu_no_train",
1371
- "dataset_name": "high_school_mathematics",
1372
- "test_split": "test",
1373
- "fewshot_split": "dev",
1374
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1375
- "doc_to_target": "answer",
1376
- "doc_to_choice": [
1377
- "A",
1378
- "B",
1379
- "C",
1380
- "D"
1381
- ],
1382
- "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
1383
- "target_delimiter": " ",
1384
- "fewshot_delimiter": "\n\n",
1385
- "fewshot_config": {
1386
- "sampler": "first_n"
1387
- },
1388
- "num_fewshot": 5,
1389
- "metric_list": [
1390
- {
1391
- "metric": "acc",
1392
- "aggregation": "mean",
1393
- "higher_is_better": true
1394
- }
1395
- ],
1396
- "output_type": "multiple_choice",
1397
- "repeats": 1,
1398
- "should_decontaminate": false,
1399
- "metadata": {
1400
- "version": 0.0
1401
- }
1402
- },
1403
- "mmlu_high_school_microeconomics": {
1404
- "task": "mmlu_high_school_microeconomics",
1405
- "task_alias": "high_school_microeconomics",
1406
- "group": "mmlu_social_sciences",
1407
- "group_alias": "social_sciences",
1408
- "dataset_path": "hails/mmlu_no_train",
1409
- "dataset_name": "high_school_microeconomics",
1410
- "test_split": "test",
1411
- "fewshot_split": "dev",
1412
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1413
- "doc_to_target": "answer",
1414
- "doc_to_choice": [
1415
- "A",
1416
- "B",
1417
- "C",
1418
- "D"
1419
- ],
1420
- "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
1421
- "target_delimiter": " ",
1422
- "fewshot_delimiter": "\n\n",
1423
- "fewshot_config": {
1424
- "sampler": "first_n"
1425
- },
1426
- "num_fewshot": 5,
1427
- "metric_list": [
1428
- {
1429
- "metric": "acc",
1430
- "aggregation": "mean",
1431
- "higher_is_better": true
1432
- }
1433
- ],
1434
- "output_type": "multiple_choice",
1435
- "repeats": 1,
1436
- "should_decontaminate": false,
1437
- "metadata": {
1438
- "version": 0.0
1439
- }
1440
- },
1441
- "mmlu_high_school_physics": {
1442
- "task": "mmlu_high_school_physics",
1443
- "task_alias": "high_school_physics",
1444
- "group": "mmlu_stem",
1445
- "group_alias": "stem",
1446
- "dataset_path": "hails/mmlu_no_train",
1447
- "dataset_name": "high_school_physics",
1448
- "test_split": "test",
1449
- "fewshot_split": "dev",
1450
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1451
- "doc_to_target": "answer",
1452
- "doc_to_choice": [
1453
- "A",
1454
- "B",
1455
- "C",
1456
- "D"
1457
- ],
1458
- "description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
1459
- "target_delimiter": " ",
1460
- "fewshot_delimiter": "\n\n",
1461
- "fewshot_config": {
1462
- "sampler": "first_n"
1463
- },
1464
- "num_fewshot": 5,
1465
- "metric_list": [
1466
- {
1467
- "metric": "acc",
1468
- "aggregation": "mean",
1469
- "higher_is_better": true
1470
- }
1471
- ],
1472
- "output_type": "multiple_choice",
1473
- "repeats": 1,
1474
- "should_decontaminate": false,
1475
- "metadata": {
1476
- "version": 0.0
1477
- }
1478
- },
1479
- "mmlu_high_school_psychology": {
1480
- "task": "mmlu_high_school_psychology",
1481
- "task_alias": "high_school_psychology",
1482
- "group": "mmlu_social_sciences",
1483
- "group_alias": "social_sciences",
1484
- "dataset_path": "hails/mmlu_no_train",
1485
- "dataset_name": "high_school_psychology",
1486
- "test_split": "test",
1487
- "fewshot_split": "dev",
1488
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1489
- "doc_to_target": "answer",
1490
- "doc_to_choice": [
1491
- "A",
1492
- "B",
1493
- "C",
1494
- "D"
1495
- ],
1496
- "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
1497
- "target_delimiter": " ",
1498
- "fewshot_delimiter": "\n\n",
1499
- "fewshot_config": {
1500
- "sampler": "first_n"
1501
- },
1502
- "num_fewshot": 5,
1503
- "metric_list": [
1504
- {
1505
- "metric": "acc",
1506
- "aggregation": "mean",
1507
- "higher_is_better": true
1508
- }
1509
- ],
1510
- "output_type": "multiple_choice",
1511
- "repeats": 1,
1512
- "should_decontaminate": false,
1513
- "metadata": {
1514
- "version": 0.0
1515
- }
1516
- },
1517
- "mmlu_high_school_statistics": {
1518
- "task": "mmlu_high_school_statistics",
1519
- "task_alias": "high_school_statistics",
1520
- "group": "mmlu_stem",
1521
- "group_alias": "stem",
1522
- "dataset_path": "hails/mmlu_no_train",
1523
- "dataset_name": "high_school_statistics",
1524
- "test_split": "test",
1525
- "fewshot_split": "dev",
1526
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1527
- "doc_to_target": "answer",
1528
- "doc_to_choice": [
1529
- "A",
1530
- "B",
1531
- "C",
1532
- "D"
1533
- ],
1534
- "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
1535
- "target_delimiter": " ",
1536
- "fewshot_delimiter": "\n\n",
1537
- "fewshot_config": {
1538
- "sampler": "first_n"
1539
- },
1540
- "num_fewshot": 5,
1541
- "metric_list": [
1542
- {
1543
- "metric": "acc",
1544
- "aggregation": "mean",
1545
- "higher_is_better": true
1546
- }
1547
- ],
1548
- "output_type": "multiple_choice",
1549
- "repeats": 1,
1550
- "should_decontaminate": false,
1551
- "metadata": {
1552
- "version": 0.0
1553
- }
1554
- },
1555
- "mmlu_high_school_us_history": {
1556
- "task": "mmlu_high_school_us_history",
1557
- "task_alias": "high_school_us_history",
1558
- "group": "mmlu_humanities",
1559
- "group_alias": "humanities",
1560
- "dataset_path": "hails/mmlu_no_train",
1561
- "dataset_name": "high_school_us_history",
1562
- "test_split": "test",
1563
- "fewshot_split": "dev",
1564
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1565
- "doc_to_target": "answer",
1566
- "doc_to_choice": [
1567
- "A",
1568
- "B",
1569
- "C",
1570
- "D"
1571
- ],
1572
- "description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
1573
- "target_delimiter": " ",
1574
- "fewshot_delimiter": "\n\n",
1575
- "fewshot_config": {
1576
- "sampler": "first_n"
1577
- },
1578
- "num_fewshot": 5,
1579
- "metric_list": [
1580
- {
1581
- "metric": "acc",
1582
- "aggregation": "mean",
1583
- "higher_is_better": true
1584
- }
1585
- ],
1586
- "output_type": "multiple_choice",
1587
- "repeats": 1,
1588
- "should_decontaminate": false,
1589
- "metadata": {
1590
- "version": 0.0
1591
- }
1592
- },
1593
- "mmlu_high_school_world_history": {
1594
- "task": "mmlu_high_school_world_history",
1595
- "task_alias": "high_school_world_history",
1596
- "group": "mmlu_humanities",
1597
- "group_alias": "humanities",
1598
- "dataset_path": "hails/mmlu_no_train",
1599
- "dataset_name": "high_school_world_history",
1600
- "test_split": "test",
1601
- "fewshot_split": "dev",
1602
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1603
- "doc_to_target": "answer",
1604
- "doc_to_choice": [
1605
- "A",
1606
- "B",
1607
- "C",
1608
- "D"
1609
- ],
1610
- "description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
1611
- "target_delimiter": " ",
1612
- "fewshot_delimiter": "\n\n",
1613
- "fewshot_config": {
1614
- "sampler": "first_n"
1615
- },
1616
- "num_fewshot": 5,
1617
- "metric_list": [
1618
- {
1619
- "metric": "acc",
1620
- "aggregation": "mean",
1621
- "higher_is_better": true
1622
- }
1623
- ],
1624
- "output_type": "multiple_choice",
1625
- "repeats": 1,
1626
- "should_decontaminate": false,
1627
- "metadata": {
1628
- "version": 0.0
1629
- }
1630
- },
1631
- "mmlu_human_aging": {
1632
- "task": "mmlu_human_aging",
1633
- "task_alias": "human_aging",
1634
- "group": "mmlu_other",
1635
- "group_alias": "other",
1636
- "dataset_path": "hails/mmlu_no_train",
1637
- "dataset_name": "human_aging",
1638
- "test_split": "test",
1639
- "fewshot_split": "dev",
1640
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1641
- "doc_to_target": "answer",
1642
- "doc_to_choice": [
1643
- "A",
1644
- "B",
1645
- "C",
1646
- "D"
1647
- ],
1648
- "description": "The following are multiple choice questions (with answers) about human aging.\n\n",
1649
- "target_delimiter": " ",
1650
- "fewshot_delimiter": "\n\n",
1651
- "fewshot_config": {
1652
- "sampler": "first_n"
1653
- },
1654
- "num_fewshot": 5,
1655
- "metric_list": [
1656
- {
1657
- "metric": "acc",
1658
- "aggregation": "mean",
1659
- "higher_is_better": true
1660
- }
1661
- ],
1662
- "output_type": "multiple_choice",
1663
- "repeats": 1,
1664
- "should_decontaminate": false,
1665
- "metadata": {
1666
- "version": 0.0
1667
- }
1668
- },
1669
- "mmlu_human_sexuality": {
1670
- "task": "mmlu_human_sexuality",
1671
- "task_alias": "human_sexuality",
1672
- "group": "mmlu_social_sciences",
1673
- "group_alias": "social_sciences",
1674
- "dataset_path": "hails/mmlu_no_train",
1675
- "dataset_name": "human_sexuality",
1676
- "test_split": "test",
1677
- "fewshot_split": "dev",
1678
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1679
- "doc_to_target": "answer",
1680
- "doc_to_choice": [
1681
- "A",
1682
- "B",
1683
- "C",
1684
- "D"
1685
- ],
1686
- "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
1687
- "target_delimiter": " ",
1688
- "fewshot_delimiter": "\n\n",
1689
- "fewshot_config": {
1690
- "sampler": "first_n"
1691
- },
1692
- "num_fewshot": 5,
1693
- "metric_list": [
1694
- {
1695
- "metric": "acc",
1696
- "aggregation": "mean",
1697
- "higher_is_better": true
1698
- }
1699
- ],
1700
- "output_type": "multiple_choice",
1701
- "repeats": 1,
1702
- "should_decontaminate": false,
1703
- "metadata": {
1704
- "version": 0.0
1705
- }
1706
- },
1707
- "mmlu_international_law": {
1708
- "task": "mmlu_international_law",
1709
- "task_alias": "international_law",
1710
- "group": "mmlu_humanities",
1711
- "group_alias": "humanities",
1712
- "dataset_path": "hails/mmlu_no_train",
1713
- "dataset_name": "international_law",
1714
- "test_split": "test",
1715
- "fewshot_split": "dev",
1716
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1717
- "doc_to_target": "answer",
1718
- "doc_to_choice": [
1719
- "A",
1720
- "B",
1721
- "C",
1722
- "D"
1723
- ],
1724
- "description": "The following are multiple choice questions (with answers) about international law.\n\n",
1725
- "target_delimiter": " ",
1726
- "fewshot_delimiter": "\n\n",
1727
- "fewshot_config": {
1728
- "sampler": "first_n"
1729
- },
1730
- "num_fewshot": 5,
1731
- "metric_list": [
1732
- {
1733
- "metric": "acc",
1734
- "aggregation": "mean",
1735
- "higher_is_better": true
1736
- }
1737
- ],
1738
- "output_type": "multiple_choice",
1739
- "repeats": 1,
1740
- "should_decontaminate": false,
1741
- "metadata": {
1742
- "version": 0.0
1743
- }
1744
- },
1745
- "mmlu_jurisprudence": {
1746
- "task": "mmlu_jurisprudence",
1747
- "task_alias": "jurisprudence",
1748
- "group": "mmlu_humanities",
1749
- "group_alias": "humanities",
1750
- "dataset_path": "hails/mmlu_no_train",
1751
- "dataset_name": "jurisprudence",
1752
- "test_split": "test",
1753
- "fewshot_split": "dev",
1754
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1755
- "doc_to_target": "answer",
1756
- "doc_to_choice": [
1757
- "A",
1758
- "B",
1759
- "C",
1760
- "D"
1761
- ],
1762
- "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
1763
- "target_delimiter": " ",
1764
- "fewshot_delimiter": "\n\n",
1765
- "fewshot_config": {
1766
- "sampler": "first_n"
1767
- },
1768
- "num_fewshot": 5,
1769
- "metric_list": [
1770
- {
1771
- "metric": "acc",
1772
- "aggregation": "mean",
1773
- "higher_is_better": true
1774
- }
1775
- ],
1776
- "output_type": "multiple_choice",
1777
- "repeats": 1,
1778
- "should_decontaminate": false,
1779
- "metadata": {
1780
- "version": 0.0
1781
- }
1782
- },
1783
- "mmlu_logical_fallacies": {
1784
- "task": "mmlu_logical_fallacies",
1785
- "task_alias": "logical_fallacies",
1786
- "group": "mmlu_humanities",
1787
- "group_alias": "humanities",
1788
- "dataset_path": "hails/mmlu_no_train",
1789
- "dataset_name": "logical_fallacies",
1790
- "test_split": "test",
1791
- "fewshot_split": "dev",
1792
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1793
- "doc_to_target": "answer",
1794
- "doc_to_choice": [
1795
- "A",
1796
- "B",
1797
- "C",
1798
- "D"
1799
- ],
1800
- "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
1801
- "target_delimiter": " ",
1802
- "fewshot_delimiter": "\n\n",
1803
- "fewshot_config": {
1804
- "sampler": "first_n"
1805
- },
1806
- "num_fewshot": 5,
1807
- "metric_list": [
1808
- {
1809
- "metric": "acc",
1810
- "aggregation": "mean",
1811
- "higher_is_better": true
1812
- }
1813
- ],
1814
- "output_type": "multiple_choice",
1815
- "repeats": 1,
1816
- "should_decontaminate": false,
1817
- "metadata": {
1818
- "version": 0.0
1819
- }
1820
- },
1821
- "mmlu_machine_learning": {
1822
- "task": "mmlu_machine_learning",
1823
- "task_alias": "machine_learning",
1824
- "group": "mmlu_stem",
1825
- "group_alias": "stem",
1826
- "dataset_path": "hails/mmlu_no_train",
1827
- "dataset_name": "machine_learning",
1828
- "test_split": "test",
1829
- "fewshot_split": "dev",
1830
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1831
- "doc_to_target": "answer",
1832
- "doc_to_choice": [
1833
- "A",
1834
- "B",
1835
- "C",
1836
- "D"
1837
- ],
1838
- "description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
1839
- "target_delimiter": " ",
1840
- "fewshot_delimiter": "\n\n",
1841
- "fewshot_config": {
1842
- "sampler": "first_n"
1843
- },
1844
- "num_fewshot": 5,
1845
- "metric_list": [
1846
- {
1847
- "metric": "acc",
1848
- "aggregation": "mean",
1849
- "higher_is_better": true
1850
- }
1851
- ],
1852
- "output_type": "multiple_choice",
1853
- "repeats": 1,
1854
- "should_decontaminate": false,
1855
- "metadata": {
1856
- "version": 0.0
1857
- }
1858
- },
1859
- "mmlu_management": {
1860
- "task": "mmlu_management",
1861
- "task_alias": "management",
1862
- "group": "mmlu_other",
1863
- "group_alias": "other",
1864
- "dataset_path": "hails/mmlu_no_train",
1865
- "dataset_name": "management",
1866
- "test_split": "test",
1867
- "fewshot_split": "dev",
1868
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1869
- "doc_to_target": "answer",
1870
- "doc_to_choice": [
1871
- "A",
1872
- "B",
1873
- "C",
1874
- "D"
1875
- ],
1876
- "description": "The following are multiple choice questions (with answers) about management.\n\n",
1877
- "target_delimiter": " ",
1878
- "fewshot_delimiter": "\n\n",
1879
- "fewshot_config": {
1880
- "sampler": "first_n"
1881
- },
1882
- "num_fewshot": 5,
1883
- "metric_list": [
1884
- {
1885
- "metric": "acc",
1886
- "aggregation": "mean",
1887
- "higher_is_better": true
1888
- }
1889
- ],
1890
- "output_type": "multiple_choice",
1891
- "repeats": 1,
1892
- "should_decontaminate": false,
1893
- "metadata": {
1894
- "version": 0.0
1895
- }
1896
- },
1897
- "mmlu_marketing": {
1898
- "task": "mmlu_marketing",
1899
- "task_alias": "marketing",
1900
- "group": "mmlu_other",
1901
- "group_alias": "other",
1902
- "dataset_path": "hails/mmlu_no_train",
1903
- "dataset_name": "marketing",
1904
- "test_split": "test",
1905
- "fewshot_split": "dev",
1906
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1907
- "doc_to_target": "answer",
1908
- "doc_to_choice": [
1909
- "A",
1910
- "B",
1911
- "C",
1912
- "D"
1913
- ],
1914
- "description": "The following are multiple choice questions (with answers) about marketing.\n\n",
1915
- "target_delimiter": " ",
1916
- "fewshot_delimiter": "\n\n",
1917
- "fewshot_config": {
1918
- "sampler": "first_n"
1919
- },
1920
- "num_fewshot": 5,
1921
- "metric_list": [
1922
- {
1923
- "metric": "acc",
1924
- "aggregation": "mean",
1925
- "higher_is_better": true
1926
- }
1927
- ],
1928
- "output_type": "multiple_choice",
1929
- "repeats": 1,
1930
- "should_decontaminate": false,
1931
- "metadata": {
1932
- "version": 0.0
1933
- }
1934
- },
1935
- "mmlu_medical_genetics": {
1936
- "task": "mmlu_medical_genetics",
1937
- "task_alias": "medical_genetics",
1938
- "group": "mmlu_other",
1939
- "group_alias": "other",
1940
- "dataset_path": "hails/mmlu_no_train",
1941
- "dataset_name": "medical_genetics",
1942
- "test_split": "test",
1943
- "fewshot_split": "dev",
1944
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1945
- "doc_to_target": "answer",
1946
- "doc_to_choice": [
1947
- "A",
1948
- "B",
1949
- "C",
1950
- "D"
1951
- ],
1952
- "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
1953
- "target_delimiter": " ",
1954
- "fewshot_delimiter": "\n\n",
1955
- "fewshot_config": {
1956
- "sampler": "first_n"
1957
- },
1958
- "num_fewshot": 5,
1959
- "metric_list": [
1960
- {
1961
- "metric": "acc",
1962
- "aggregation": "mean",
1963
- "higher_is_better": true
1964
- }
1965
- ],
1966
- "output_type": "multiple_choice",
1967
- "repeats": 1,
1968
- "should_decontaminate": false,
1969
- "metadata": {
1970
- "version": 0.0
1971
- }
1972
- },
1973
- "mmlu_miscellaneous": {
1974
- "task": "mmlu_miscellaneous",
1975
- "task_alias": "miscellaneous",
1976
- "group": "mmlu_other",
1977
- "group_alias": "other",
1978
- "dataset_path": "hails/mmlu_no_train",
1979
- "dataset_name": "miscellaneous",
1980
- "test_split": "test",
1981
- "fewshot_split": "dev",
1982
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1983
- "doc_to_target": "answer",
1984
- "doc_to_choice": [
1985
- "A",
1986
- "B",
1987
- "C",
1988
- "D"
1989
- ],
1990
- "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
1991
- "target_delimiter": " ",
1992
- "fewshot_delimiter": "\n\n",
1993
- "fewshot_config": {
1994
- "sampler": "first_n"
1995
- },
1996
- "num_fewshot": 5,
1997
- "metric_list": [
1998
- {
1999
- "metric": "acc",
2000
- "aggregation": "mean",
2001
- "higher_is_better": true
2002
- }
2003
- ],
2004
- "output_type": "multiple_choice",
2005
- "repeats": 1,
2006
- "should_decontaminate": false,
2007
- "metadata": {
2008
- "version": 0.0
2009
- }
2010
- },
2011
- "mmlu_moral_disputes": {
2012
- "task": "mmlu_moral_disputes",
2013
- "task_alias": "moral_disputes",
2014
- "group": "mmlu_humanities",
2015
- "group_alias": "humanities",
2016
- "dataset_path": "hails/mmlu_no_train",
2017
- "dataset_name": "moral_disputes",
2018
- "test_split": "test",
2019
- "fewshot_split": "dev",
2020
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2021
- "doc_to_target": "answer",
2022
- "doc_to_choice": [
2023
- "A",
2024
- "B",
2025
- "C",
2026
- "D"
2027
- ],
2028
- "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
2029
- "target_delimiter": " ",
2030
- "fewshot_delimiter": "\n\n",
2031
- "fewshot_config": {
2032
- "sampler": "first_n"
2033
- },
2034
- "num_fewshot": 5,
2035
- "metric_list": [
2036
- {
2037
- "metric": "acc",
2038
- "aggregation": "mean",
2039
- "higher_is_better": true
2040
- }
2041
- ],
2042
- "output_type": "multiple_choice",
2043
- "repeats": 1,
2044
- "should_decontaminate": false,
2045
- "metadata": {
2046
- "version": 0.0
2047
- }
2048
- },
2049
- "mmlu_moral_scenarios": {
2050
- "task": "mmlu_moral_scenarios",
2051
- "task_alias": "moral_scenarios",
2052
- "group": "mmlu_humanities",
2053
- "group_alias": "humanities",
2054
- "dataset_path": "hails/mmlu_no_train",
2055
- "dataset_name": "moral_scenarios",
2056
- "test_split": "test",
2057
- "fewshot_split": "dev",
2058
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2059
- "doc_to_target": "answer",
2060
- "doc_to_choice": [
2061
- "A",
2062
- "B",
2063
- "C",
2064
- "D"
2065
- ],
2066
- "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
2067
- "target_delimiter": " ",
2068
- "fewshot_delimiter": "\n\n",
2069
- "fewshot_config": {
2070
- "sampler": "first_n"
2071
- },
2072
- "num_fewshot": 5,
2073
- "metric_list": [
2074
- {
2075
- "metric": "acc",
2076
- "aggregation": "mean",
2077
- "higher_is_better": true
2078
- }
2079
- ],
2080
- "output_type": "multiple_choice",
2081
- "repeats": 1,
2082
- "should_decontaminate": false,
2083
- "metadata": {
2084
- "version": 0.0
2085
- }
2086
- },
2087
- "mmlu_nutrition": {
2088
- "task": "mmlu_nutrition",
2089
- "task_alias": "nutrition",
2090
- "group": "mmlu_other",
2091
- "group_alias": "other",
2092
- "dataset_path": "hails/mmlu_no_train",
2093
- "dataset_name": "nutrition",
2094
- "test_split": "test",
2095
- "fewshot_split": "dev",
2096
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2097
- "doc_to_target": "answer",
2098
- "doc_to_choice": [
2099
- "A",
2100
- "B",
2101
- "C",
2102
- "D"
2103
- ],
2104
- "description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
2105
- "target_delimiter": " ",
2106
- "fewshot_delimiter": "\n\n",
2107
- "fewshot_config": {
2108
- "sampler": "first_n"
2109
- },
2110
- "num_fewshot": 5,
2111
- "metric_list": [
2112
- {
2113
- "metric": "acc",
2114
- "aggregation": "mean",
2115
- "higher_is_better": true
2116
- }
2117
- ],
2118
- "output_type": "multiple_choice",
2119
- "repeats": 1,
2120
- "should_decontaminate": false,
2121
- "metadata": {
2122
- "version": 0.0
2123
- }
2124
- },
2125
- "mmlu_philosophy": {
2126
- "task": "mmlu_philosophy",
2127
- "task_alias": "philosophy",
2128
- "group": "mmlu_humanities",
2129
- "group_alias": "humanities",
2130
- "dataset_path": "hails/mmlu_no_train",
2131
- "dataset_name": "philosophy",
2132
- "test_split": "test",
2133
- "fewshot_split": "dev",
2134
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2135
- "doc_to_target": "answer",
2136
- "doc_to_choice": [
2137
- "A",
2138
- "B",
2139
- "C",
2140
- "D"
2141
- ],
2142
- "description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
2143
- "target_delimiter": " ",
2144
- "fewshot_delimiter": "\n\n",
2145
- "fewshot_config": {
2146
- "sampler": "first_n"
2147
- },
2148
- "num_fewshot": 5,
2149
- "metric_list": [
2150
- {
2151
- "metric": "acc",
2152
- "aggregation": "mean",
2153
- "higher_is_better": true
2154
- }
2155
- ],
2156
- "output_type": "multiple_choice",
2157
- "repeats": 1,
2158
- "should_decontaminate": false,
2159
- "metadata": {
2160
- "version": 0.0
2161
- }
2162
- },
2163
- "mmlu_prehistory": {
2164
- "task": "mmlu_prehistory",
2165
- "task_alias": "prehistory",
2166
- "group": "mmlu_humanities",
2167
- "group_alias": "humanities",
2168
- "dataset_path": "hails/mmlu_no_train",
2169
- "dataset_name": "prehistory",
2170
- "test_split": "test",
2171
- "fewshot_split": "dev",
2172
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2173
- "doc_to_target": "answer",
2174
- "doc_to_choice": [
2175
- "A",
2176
- "B",
2177
- "C",
2178
- "D"
2179
- ],
2180
- "description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
2181
- "target_delimiter": " ",
2182
- "fewshot_delimiter": "\n\n",
2183
- "fewshot_config": {
2184
- "sampler": "first_n"
2185
- },
2186
- "num_fewshot": 5,
2187
- "metric_list": [
2188
- {
2189
- "metric": "acc",
2190
- "aggregation": "mean",
2191
- "higher_is_better": true
2192
- }
2193
- ],
2194
- "output_type": "multiple_choice",
2195
- "repeats": 1,
2196
- "should_decontaminate": false,
2197
- "metadata": {
2198
- "version": 0.0
2199
- }
2200
- },
2201
- "mmlu_professional_accounting": {
2202
- "task": "mmlu_professional_accounting",
2203
- "task_alias": "professional_accounting",
2204
- "group": "mmlu_other",
2205
- "group_alias": "other",
2206
- "dataset_path": "hails/mmlu_no_train",
2207
- "dataset_name": "professional_accounting",
2208
- "test_split": "test",
2209
- "fewshot_split": "dev",
2210
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2211
- "doc_to_target": "answer",
2212
- "doc_to_choice": [
2213
- "A",
2214
- "B",
2215
- "C",
2216
- "D"
2217
- ],
2218
- "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
2219
- "target_delimiter": " ",
2220
- "fewshot_delimiter": "\n\n",
2221
- "fewshot_config": {
2222
- "sampler": "first_n"
2223
- },
2224
- "num_fewshot": 5,
2225
- "metric_list": [
2226
- {
2227
- "metric": "acc",
2228
- "aggregation": "mean",
2229
- "higher_is_better": true
2230
- }
2231
- ],
2232
- "output_type": "multiple_choice",
2233
- "repeats": 1,
2234
- "should_decontaminate": false,
2235
- "metadata": {
2236
- "version": 0.0
2237
- }
2238
- },
2239
- "mmlu_professional_law": {
2240
- "task": "mmlu_professional_law",
2241
- "task_alias": "professional_law",
2242
- "group": "mmlu_humanities",
2243
- "group_alias": "humanities",
2244
- "dataset_path": "hails/mmlu_no_train",
2245
- "dataset_name": "professional_law",
2246
- "test_split": "test",
2247
- "fewshot_split": "dev",
2248
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2249
- "doc_to_target": "answer",
2250
- "doc_to_choice": [
2251
- "A",
2252
- "B",
2253
- "C",
2254
- "D"
2255
- ],
2256
- "description": "The following are multiple choice questions (with answers) about professional law.\n\n",
2257
- "target_delimiter": " ",
2258
- "fewshot_delimiter": "\n\n",
2259
- "fewshot_config": {
2260
- "sampler": "first_n"
2261
- },
2262
- "num_fewshot": 5,
2263
- "metric_list": [
2264
- {
2265
- "metric": "acc",
2266
- "aggregation": "mean",
2267
- "higher_is_better": true
2268
- }
2269
- ],
2270
- "output_type": "multiple_choice",
2271
- "repeats": 1,
2272
- "should_decontaminate": false,
2273
- "metadata": {
2274
- "version": 0.0
2275
- }
2276
- },
2277
- "mmlu_professional_medicine": {
2278
- "task": "mmlu_professional_medicine",
2279
- "task_alias": "professional_medicine",
2280
- "group": "mmlu_other",
2281
- "group_alias": "other",
2282
- "dataset_path": "hails/mmlu_no_train",
2283
- "dataset_name": "professional_medicine",
2284
- "test_split": "test",
2285
- "fewshot_split": "dev",
2286
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2287
- "doc_to_target": "answer",
2288
- "doc_to_choice": [
2289
- "A",
2290
- "B",
2291
- "C",
2292
- "D"
2293
- ],
2294
- "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
2295
- "target_delimiter": " ",
2296
- "fewshot_delimiter": "\n\n",
2297
- "fewshot_config": {
2298
- "sampler": "first_n"
2299
- },
2300
- "num_fewshot": 5,
2301
- "metric_list": [
2302
- {
2303
- "metric": "acc",
2304
- "aggregation": "mean",
2305
- "higher_is_better": true
2306
- }
2307
- ],
2308
- "output_type": "multiple_choice",
2309
- "repeats": 1,
2310
- "should_decontaminate": false,
2311
- "metadata": {
2312
- "version": 0.0
2313
- }
2314
- },
2315
- "mmlu_professional_psychology": {
2316
- "task": "mmlu_professional_psychology",
2317
- "task_alias": "professional_psychology",
2318
- "group": "mmlu_social_sciences",
2319
- "group_alias": "social_sciences",
2320
- "dataset_path": "hails/mmlu_no_train",
2321
- "dataset_name": "professional_psychology",
2322
- "test_split": "test",
2323
- "fewshot_split": "dev",
2324
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2325
- "doc_to_target": "answer",
2326
- "doc_to_choice": [
2327
- "A",
2328
- "B",
2329
- "C",
2330
- "D"
2331
- ],
2332
- "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
2333
- "target_delimiter": " ",
2334
- "fewshot_delimiter": "\n\n",
2335
- "fewshot_config": {
2336
- "sampler": "first_n"
2337
- },
2338
- "num_fewshot": 5,
2339
- "metric_list": [
2340
- {
2341
- "metric": "acc",
2342
- "aggregation": "mean",
2343
- "higher_is_better": true
2344
- }
2345
- ],
2346
- "output_type": "multiple_choice",
2347
- "repeats": 1,
2348
- "should_decontaminate": false,
2349
- "metadata": {
2350
- "version": 0.0
2351
- }
2352
- },
2353
- "mmlu_public_relations": {
2354
- "task": "mmlu_public_relations",
2355
- "task_alias": "public_relations",
2356
- "group": "mmlu_social_sciences",
2357
- "group_alias": "social_sciences",
2358
- "dataset_path": "hails/mmlu_no_train",
2359
- "dataset_name": "public_relations",
2360
- "test_split": "test",
2361
- "fewshot_split": "dev",
2362
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2363
- "doc_to_target": "answer",
2364
- "doc_to_choice": [
2365
- "A",
2366
- "B",
2367
- "C",
2368
- "D"
2369
- ],
2370
- "description": "The following are multiple choice questions (with answers) about public relations.\n\n",
2371
- "target_delimiter": " ",
2372
- "fewshot_delimiter": "\n\n",
2373
- "fewshot_config": {
2374
- "sampler": "first_n"
2375
- },
2376
- "num_fewshot": 5,
2377
- "metric_list": [
2378
- {
2379
- "metric": "acc",
2380
- "aggregation": "mean",
2381
- "higher_is_better": true
2382
- }
2383
- ],
2384
- "output_type": "multiple_choice",
2385
- "repeats": 1,
2386
- "should_decontaminate": false,
2387
- "metadata": {
2388
- "version": 0.0
2389
- }
2390
- },
2391
- "mmlu_security_studies": {
2392
- "task": "mmlu_security_studies",
2393
- "task_alias": "security_studies",
2394
- "group": "mmlu_social_sciences",
2395
- "group_alias": "social_sciences",
2396
- "dataset_path": "hails/mmlu_no_train",
2397
- "dataset_name": "security_studies",
2398
- "test_split": "test",
2399
- "fewshot_split": "dev",
2400
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2401
- "doc_to_target": "answer",
2402
- "doc_to_choice": [
2403
- "A",
2404
- "B",
2405
- "C",
2406
- "D"
2407
- ],
2408
- "description": "The following are multiple choice questions (with answers) about security studies.\n\n",
2409
- "target_delimiter": " ",
2410
- "fewshot_delimiter": "\n\n",
2411
- "fewshot_config": {
2412
- "sampler": "first_n"
2413
- },
2414
- "num_fewshot": 5,
2415
- "metric_list": [
2416
- {
2417
- "metric": "acc",
2418
- "aggregation": "mean",
2419
- "higher_is_better": true
2420
- }
2421
- ],
2422
- "output_type": "multiple_choice",
2423
- "repeats": 1,
2424
- "should_decontaminate": false,
2425
- "metadata": {
2426
- "version": 0.0
2427
- }
2428
- },
2429
- "mmlu_sociology": {
2430
- "task": "mmlu_sociology",
2431
- "task_alias": "sociology",
2432
- "group": "mmlu_social_sciences",
2433
- "group_alias": "social_sciences",
2434
- "dataset_path": "hails/mmlu_no_train",
2435
- "dataset_name": "sociology",
2436
- "test_split": "test",
2437
- "fewshot_split": "dev",
2438
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2439
- "doc_to_target": "answer",
2440
- "doc_to_choice": [
2441
- "A",
2442
- "B",
2443
- "C",
2444
- "D"
2445
- ],
2446
- "description": "The following are multiple choice questions (with answers) about sociology.\n\n",
2447
- "target_delimiter": " ",
2448
- "fewshot_delimiter": "\n\n",
2449
- "fewshot_config": {
2450
- "sampler": "first_n"
2451
- },
2452
- "num_fewshot": 5,
2453
- "metric_list": [
2454
- {
2455
- "metric": "acc",
2456
- "aggregation": "mean",
2457
- "higher_is_better": true
2458
- }
2459
- ],
2460
- "output_type": "multiple_choice",
2461
- "repeats": 1,
2462
- "should_decontaminate": false,
2463
- "metadata": {
2464
- "version": 0.0
2465
- }
2466
- },
2467
- "mmlu_us_foreign_policy": {
2468
- "task": "mmlu_us_foreign_policy",
2469
- "task_alias": "us_foreign_policy",
2470
- "group": "mmlu_social_sciences",
2471
- "group_alias": "social_sciences",
2472
- "dataset_path": "hails/mmlu_no_train",
2473
- "dataset_name": "us_foreign_policy",
2474
- "test_split": "test",
2475
- "fewshot_split": "dev",
2476
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2477
- "doc_to_target": "answer",
2478
- "doc_to_choice": [
2479
- "A",
2480
- "B",
2481
- "C",
2482
- "D"
2483
- ],
2484
- "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
2485
- "target_delimiter": " ",
2486
- "fewshot_delimiter": "\n\n",
2487
- "fewshot_config": {
2488
- "sampler": "first_n"
2489
- },
2490
- "num_fewshot": 5,
2491
- "metric_list": [
2492
- {
2493
- "metric": "acc",
2494
- "aggregation": "mean",
2495
- "higher_is_better": true
2496
- }
2497
- ],
2498
- "output_type": "multiple_choice",
2499
- "repeats": 1,
2500
- "should_decontaminate": false,
2501
- "metadata": {
2502
- "version": 0.0
2503
- }
2504
- },
2505
- "mmlu_virology": {
2506
- "task": "mmlu_virology",
2507
- "task_alias": "virology",
2508
- "group": "mmlu_other",
2509
- "group_alias": "other",
2510
- "dataset_path": "hails/mmlu_no_train",
2511
- "dataset_name": "virology",
2512
- "test_split": "test",
2513
- "fewshot_split": "dev",
2514
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2515
- "doc_to_target": "answer",
2516
- "doc_to_choice": [
2517
- "A",
2518
- "B",
2519
- "C",
2520
- "D"
2521
- ],
2522
- "description": "The following are multiple choice questions (with answers) about virology.\n\n",
2523
- "target_delimiter": " ",
2524
- "fewshot_delimiter": "\n\n",
2525
- "fewshot_config": {
2526
- "sampler": "first_n"
2527
- },
2528
- "num_fewshot": 5,
2529
- "metric_list": [
2530
- {
2531
- "metric": "acc",
2532
- "aggregation": "mean",
2533
- "higher_is_better": true
2534
- }
2535
- ],
2536
- "output_type": "multiple_choice",
2537
- "repeats": 1,
2538
- "should_decontaminate": false,
2539
- "metadata": {
2540
- "version": 0.0
2541
- }
2542
- },
2543
- "mmlu_world_religions": {
2544
- "task": "mmlu_world_religions",
2545
- "task_alias": "world_religions",
2546
- "group": "mmlu_humanities",
2547
- "group_alias": "humanities",
2548
- "dataset_path": "hails/mmlu_no_train",
2549
- "dataset_name": "world_religions",
2550
- "test_split": "test",
2551
- "fewshot_split": "dev",
2552
- "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2553
- "doc_to_target": "answer",
2554
- "doc_to_choice": [
2555
- "A",
2556
- "B",
2557
- "C",
2558
- "D"
2559
- ],
2560
- "description": "The following are multiple choice questions (with answers) about world religions.\n\n",
2561
- "target_delimiter": " ",
2562
- "fewshot_delimiter": "\n\n",
2563
- "fewshot_config": {
2564
- "sampler": "first_n"
2565
- },
2566
- "num_fewshot": 5,
2567
- "metric_list": [
2568
- {
2569
- "metric": "acc",
2570
- "aggregation": "mean",
2571
- "higher_is_better": true
2572
- }
2573
- ],
2574
- "output_type": "multiple_choice",
2575
- "repeats": 1,
2576
- "should_decontaminate": false,
2577
- "metadata": {
2578
- "version": 0.0
2579
- }
2580
- }
2581
- },
2582
- "versions": {
2583
- "mmlu_abstract_algebra": 0.0,
2584
- "mmlu_anatomy": 0.0,
2585
- "mmlu_astronomy": 0.0,
2586
- "mmlu_business_ethics": 0.0,
2587
- "mmlu_clinical_knowledge": 0.0,
2588
- "mmlu_college_biology": 0.0,
2589
- "mmlu_college_chemistry": 0.0,
2590
- "mmlu_college_computer_science": 0.0,
2591
- "mmlu_college_mathematics": 0.0,
2592
- "mmlu_college_medicine": 0.0,
2593
- "mmlu_college_physics": 0.0,
2594
- "mmlu_computer_security": 0.0,
2595
- "mmlu_conceptual_physics": 0.0,
2596
- "mmlu_econometrics": 0.0,
2597
- "mmlu_electrical_engineering": 0.0,
2598
- "mmlu_elementary_mathematics": 0.0,
2599
- "mmlu_formal_logic": 0.0,
2600
- "mmlu_global_facts": 0.0,
2601
- "mmlu_high_school_biology": 0.0,
2602
- "mmlu_high_school_chemistry": 0.0,
2603
- "mmlu_high_school_computer_science": 0.0,
2604
- "mmlu_high_school_european_history": 0.0,
2605
- "mmlu_high_school_geography": 0.0,
2606
- "mmlu_high_school_government_and_politics": 0.0,
2607
- "mmlu_high_school_macroeconomics": 0.0,
2608
- "mmlu_high_school_mathematics": 0.0,
2609
- "mmlu_high_school_microeconomics": 0.0,
2610
- "mmlu_high_school_physics": 0.0,
2611
- "mmlu_high_school_psychology": 0.0,
2612
- "mmlu_high_school_statistics": 0.0,
2613
- "mmlu_high_school_us_history": 0.0,
2614
- "mmlu_high_school_world_history": 0.0,
2615
- "mmlu_human_aging": 0.0,
2616
- "mmlu_human_sexuality": 0.0,
2617
- "mmlu_international_law": 0.0,
2618
- "mmlu_jurisprudence": 0.0,
2619
- "mmlu_logical_fallacies": 0.0,
2620
- "mmlu_machine_learning": 0.0,
2621
- "mmlu_management": 0.0,
2622
- "mmlu_marketing": 0.0,
2623
- "mmlu_medical_genetics": 0.0,
2624
- "mmlu_miscellaneous": 0.0,
2625
- "mmlu_moral_disputes": 0.0,
2626
- "mmlu_moral_scenarios": 0.0,
2627
- "mmlu_nutrition": 0.0,
2628
- "mmlu_philosophy": 0.0,
2629
- "mmlu_prehistory": 0.0,
2630
- "mmlu_professional_accounting": 0.0,
2631
- "mmlu_professional_law": 0.0,
2632
- "mmlu_professional_medicine": 0.0,
2633
- "mmlu_professional_psychology": 0.0,
2634
- "mmlu_public_relations": 0.0,
2635
- "mmlu_security_studies": 0.0,
2636
- "mmlu_sociology": 0.0,
2637
- "mmlu_us_foreign_policy": 0.0,
2638
- "mmlu_virology": 0.0,
2639
- "mmlu_world_religions": 0.0
2640
- },
2641
- "n-shot": {
2642
- "mmlu": 0,
2643
- "mmlu_abstract_algebra": 5,
2644
- "mmlu_anatomy": 5,
2645
- "mmlu_astronomy": 5,
2646
- "mmlu_business_ethics": 5,
2647
- "mmlu_clinical_knowledge": 5,
2648
- "mmlu_college_biology": 5,
2649
- "mmlu_college_chemistry": 5,
2650
- "mmlu_college_computer_science": 5,
2651
- "mmlu_college_mathematics": 5,
2652
- "mmlu_college_medicine": 5,
2653
- "mmlu_college_physics": 5,
2654
- "mmlu_computer_security": 5,
2655
- "mmlu_conceptual_physics": 5,
2656
- "mmlu_econometrics": 5,
2657
- "mmlu_electrical_engineering": 5,
2658
- "mmlu_elementary_mathematics": 5,
2659
- "mmlu_formal_logic": 5,
2660
- "mmlu_global_facts": 5,
2661
- "mmlu_high_school_biology": 5,
2662
- "mmlu_high_school_chemistry": 5,
2663
- "mmlu_high_school_computer_science": 5,
2664
- "mmlu_high_school_european_history": 5,
2665
- "mmlu_high_school_geography": 5,
2666
- "mmlu_high_school_government_and_politics": 5,
2667
- "mmlu_high_school_macroeconomics": 5,
2668
- "mmlu_high_school_mathematics": 5,
2669
- "mmlu_high_school_microeconomics": 5,
2670
- "mmlu_high_school_physics": 5,
2671
- "mmlu_high_school_psychology": 5,
2672
- "mmlu_high_school_statistics": 5,
2673
- "mmlu_high_school_us_history": 5,
2674
- "mmlu_high_school_world_history": 5,
2675
- "mmlu_human_aging": 5,
2676
- "mmlu_human_sexuality": 5,
2677
- "mmlu_humanities": 5,
2678
- "mmlu_international_law": 5,
2679
- "mmlu_jurisprudence": 5,
2680
- "mmlu_logical_fallacies": 5,
2681
- "mmlu_machine_learning": 5,
2682
- "mmlu_management": 5,
2683
- "mmlu_marketing": 5,
2684
- "mmlu_medical_genetics": 5,
2685
- "mmlu_miscellaneous": 5,
2686
- "mmlu_moral_disputes": 5,
2687
- "mmlu_moral_scenarios": 5,
2688
- "mmlu_nutrition": 5,
2689
- "mmlu_other": 5,
2690
- "mmlu_philosophy": 5,
2691
- "mmlu_prehistory": 5,
2692
- "mmlu_professional_accounting": 5,
2693
- "mmlu_professional_law": 5,
2694
- "mmlu_professional_medicine": 5,
2695
- "mmlu_professional_psychology": 5,
2696
- "mmlu_public_relations": 5,
2697
- "mmlu_security_studies": 5,
2698
- "mmlu_social_sciences": 5,
2699
- "mmlu_sociology": 5,
2700
- "mmlu_stem": 5,
2701
- "mmlu_us_foreign_policy": 5,
2702
- "mmlu_virology": 5,
2703
- "mmlu_world_religions": 5
2704
- },
2705
- "higher_is_better": {
2706
- "mmlu": {
2707
- "acc": true
2708
- },
2709
- "mmlu_abstract_algebra": {
2710
- "acc": true
2711
- },
2712
- "mmlu_anatomy": {
2713
- "acc": true
2714
- },
2715
- "mmlu_astronomy": {
2716
- "acc": true
2717
- },
2718
- "mmlu_business_ethics": {
2719
- "acc": true
2720
- },
2721
- "mmlu_clinical_knowledge": {
2722
- "acc": true
2723
- },
2724
- "mmlu_college_biology": {
2725
- "acc": true
2726
- },
2727
- "mmlu_college_chemistry": {
2728
- "acc": true
2729
- },
2730
- "mmlu_college_computer_science": {
2731
- "acc": true
2732
- },
2733
- "mmlu_college_mathematics": {
2734
- "acc": true
2735
- },
2736
- "mmlu_college_medicine": {
2737
- "acc": true
2738
- },
2739
- "mmlu_college_physics": {
2740
- "acc": true
2741
- },
2742
- "mmlu_computer_security": {
2743
- "acc": true
2744
- },
2745
- "mmlu_conceptual_physics": {
2746
- "acc": true
2747
- },
2748
- "mmlu_econometrics": {
2749
- "acc": true
2750
- },
2751
- "mmlu_electrical_engineering": {
2752
- "acc": true
2753
- },
2754
- "mmlu_elementary_mathematics": {
2755
- "acc": true
2756
- },
2757
- "mmlu_formal_logic": {
2758
- "acc": true
2759
- },
2760
- "mmlu_global_facts": {
2761
- "acc": true
2762
- },
2763
- "mmlu_high_school_biology": {
2764
- "acc": true
2765
- },
2766
- "mmlu_high_school_chemistry": {
2767
- "acc": true
2768
- },
2769
- "mmlu_high_school_computer_science": {
2770
- "acc": true
2771
- },
2772
- "mmlu_high_school_european_history": {
2773
- "acc": true
2774
- },
2775
- "mmlu_high_school_geography": {
2776
- "acc": true
2777
- },
2778
- "mmlu_high_school_government_and_politics": {
2779
- "acc": true
2780
- },
2781
- "mmlu_high_school_macroeconomics": {
2782
- "acc": true
2783
- },
2784
- "mmlu_high_school_mathematics": {
2785
- "acc": true
2786
- },
2787
- "mmlu_high_school_microeconomics": {
2788
- "acc": true
2789
- },
2790
- "mmlu_high_school_physics": {
2791
- "acc": true
2792
- },
2793
- "mmlu_high_school_psychology": {
2794
- "acc": true
2795
- },
2796
- "mmlu_high_school_statistics": {
2797
- "acc": true
2798
- },
2799
- "mmlu_high_school_us_history": {
2800
- "acc": true
2801
- },
2802
- "mmlu_high_school_world_history": {
2803
- "acc": true
2804
- },
2805
- "mmlu_human_aging": {
2806
- "acc": true
2807
- },
2808
- "mmlu_human_sexuality": {
2809
- "acc": true
2810
- },
2811
- "mmlu_humanities": {
2812
- "acc": true
2813
- },
2814
- "mmlu_international_law": {
2815
- "acc": true
2816
- },
2817
- "mmlu_jurisprudence": {
2818
- "acc": true
2819
- },
2820
- "mmlu_logical_fallacies": {
2821
- "acc": true
2822
- },
2823
- "mmlu_machine_learning": {
2824
- "acc": true
2825
- },
2826
- "mmlu_management": {
2827
- "acc": true
2828
- },
2829
- "mmlu_marketing": {
2830
- "acc": true
2831
- },
2832
- "mmlu_medical_genetics": {
2833
- "acc": true
2834
- },
2835
- "mmlu_miscellaneous": {
2836
- "acc": true
2837
- },
2838
- "mmlu_moral_disputes": {
2839
- "acc": true
2840
- },
2841
- "mmlu_moral_scenarios": {
2842
- "acc": true
2843
- },
2844
- "mmlu_nutrition": {
2845
- "acc": true
2846
- },
2847
- "mmlu_other": {
2848
- "acc": true
2849
- },
2850
- "mmlu_philosophy": {
2851
- "acc": true
2852
- },
2853
- "mmlu_prehistory": {
2854
- "acc": true
2855
- },
2856
- "mmlu_professional_accounting": {
2857
- "acc": true
2858
- },
2859
- "mmlu_professional_law": {
2860
- "acc": true
2861
- },
2862
- "mmlu_professional_medicine": {
2863
- "acc": true
2864
- },
2865
- "mmlu_professional_psychology": {
2866
- "acc": true
2867
- },
2868
- "mmlu_public_relations": {
2869
- "acc": true
2870
- },
2871
- "mmlu_security_studies": {
2872
- "acc": true
2873
- },
2874
- "mmlu_social_sciences": {
2875
- "acc": true
2876
- },
2877
- "mmlu_sociology": {
2878
- "acc": true
2879
- },
2880
- "mmlu_stem": {
2881
- "acc": true
2882
- },
2883
- "mmlu_us_foreign_policy": {
2884
- "acc": true
2885
- },
2886
- "mmlu_virology": {
2887
- "acc": true
2888
- },
2889
- "mmlu_world_religions": {
2890
- "acc": true
2891
- }
2892
- },
2893
- "n-samples": {
2894
- "mmlu_moral_scenarios": {
2895
- "original": 895,
2896
- "effective": 895
2897
- },
2898
- "mmlu_high_school_us_history": {
2899
- "original": 204,
2900
- "effective": 204
2901
- },
2902
- "mmlu_high_school_world_history": {
2903
- "original": 237,
2904
- "effective": 237
2905
- },
2906
- "mmlu_world_religions": {
2907
- "original": 171,
2908
- "effective": 171
2909
- },
2910
- "mmlu_formal_logic": {
2911
- "original": 126,
2912
- "effective": 126
2913
- },
2914
- "mmlu_moral_disputes": {
2915
- "original": 346,
2916
- "effective": 346
2917
- },
2918
- "mmlu_prehistory": {
2919
- "original": 324,
2920
- "effective": 324
2921
- },
2922
- "mmlu_international_law": {
2923
- "original": 121,
2924
- "effective": 121
2925
- },
2926
- "mmlu_logical_fallacies": {
2927
- "original": 163,
2928
- "effective": 163
2929
- },
2930
- "mmlu_professional_law": {
2931
- "original": 1534,
2932
- "effective": 1534
2933
- },
2934
- "mmlu_philosophy": {
2935
- "original": 311,
2936
- "effective": 311
2937
- },
2938
- "mmlu_high_school_european_history": {
2939
- "original": 165,
2940
- "effective": 165
2941
- },
2942
- "mmlu_jurisprudence": {
2943
- "original": 108,
2944
- "effective": 108
2945
- },
2946
- "mmlu_high_school_psychology": {
2947
- "original": 545,
2948
- "effective": 545
2949
- },
2950
- "mmlu_high_school_geography": {
2951
- "original": 198,
2952
- "effective": 198
2953
- },
2954
- "mmlu_high_school_macroeconomics": {
2955
- "original": 390,
2956
- "effective": 390
2957
- },
2958
- "mmlu_public_relations": {
2959
- "original": 110,
2960
- "effective": 110
2961
- },
2962
- "mmlu_security_studies": {
2963
- "original": 245,
2964
- "effective": 245
2965
- },
2966
- "mmlu_high_school_microeconomics": {
2967
- "original": 238,
2968
- "effective": 238
2969
- },
2970
- "mmlu_human_sexuality": {
2971
- "original": 131,
2972
- "effective": 131
2973
- },
2974
- "mmlu_sociology": {
2975
- "original": 201,
2976
- "effective": 201
2977
- },
2978
- "mmlu_professional_psychology": {
2979
- "original": 612,
2980
- "effective": 612
2981
- },
2982
- "mmlu_econometrics": {
2983
- "original": 114,
2984
- "effective": 114
2985
- },
2986
- "mmlu_us_foreign_policy": {
2987
- "original": 100,
2988
- "effective": 100
2989
- },
2990
- "mmlu_high_school_government_and_politics": {
2991
- "original": 193,
2992
- "effective": 193
2993
- },
2994
- "mmlu_marketing": {
2995
- "original": 234,
2996
- "effective": 234
2997
- },
2998
- "mmlu_professional_accounting": {
2999
- "original": 282,
3000
- "effective": 282
3001
- },
3002
- "mmlu_clinical_knowledge": {
3003
- "original": 265,
3004
- "effective": 265
3005
- },
3006
- "mmlu_college_medicine": {
3007
- "original": 173,
3008
- "effective": 173
3009
- },
3010
- "mmlu_miscellaneous": {
3011
- "original": 783,
3012
- "effective": 783
3013
- },
3014
- "mmlu_virology": {
3015
- "original": 166,
3016
- "effective": 166
3017
- },
3018
- "mmlu_business_ethics": {
3019
- "original": 100,
3020
- "effective": 100
3021
- },
3022
- "mmlu_professional_medicine": {
3023
- "original": 272,
3024
- "effective": 272
3025
- },
3026
- "mmlu_global_facts": {
3027
- "original": 100,
3028
- "effective": 100
3029
- },
3030
- "mmlu_nutrition": {
3031
- "original": 306,
3032
- "effective": 306
3033
- },
3034
- "mmlu_human_aging": {
3035
- "original": 223,
3036
- "effective": 223
3037
- },
3038
- "mmlu_management": {
3039
- "original": 103,
3040
- "effective": 103
3041
- },
3042
- "mmlu_medical_genetics": {
3043
- "original": 100,
3044
- "effective": 100
3045
- },
3046
- "mmlu_college_biology": {
3047
- "original": 144,
3048
- "effective": 144
3049
- },
3050
- "mmlu_high_school_computer_science": {
3051
- "original": 100,
3052
- "effective": 100
3053
- },
3054
- "mmlu_elementary_mathematics": {
3055
- "original": 378,
3056
- "effective": 378
3057
- },
3058
- "mmlu_astronomy": {
3059
- "original": 152,
3060
- "effective": 152
3061
- },
3062
- "mmlu_machine_learning": {
3063
- "original": 112,
3064
- "effective": 112
3065
- },
3066
- "mmlu_high_school_mathematics": {
3067
- "original": 270,
3068
- "effective": 270
3069
- },
3070
- "mmlu_electrical_engineering": {
3071
- "original": 145,
3072
- "effective": 145
3073
- },
3074
- "mmlu_college_chemistry": {
3075
- "original": 100,
3076
- "effective": 100
3077
- },
3078
- "mmlu_college_mathematics": {
3079
- "original": 100,
3080
- "effective": 100
3081
- },
3082
- "mmlu_high_school_statistics": {
3083
- "original": 216,
3084
- "effective": 216
3085
- },
3086
- "mmlu_high_school_biology": {
3087
- "original": 310,
3088
- "effective": 310
3089
- },
3090
- "mmlu_abstract_algebra": {
3091
- "original": 100,
3092
- "effective": 100
3093
- },
3094
- "mmlu_college_physics": {
3095
- "original": 102,
3096
- "effective": 102
3097
- },
3098
- "mmlu_conceptual_physics": {
3099
- "original": 235,
3100
- "effective": 235
3101
- },
3102
- "mmlu_computer_security": {
3103
- "original": 100,
3104
- "effective": 100
3105
- },
3106
- "mmlu_anatomy": {
3107
- "original": 135,
3108
- "effective": 135
3109
- },
3110
- "mmlu_college_computer_science": {
3111
- "original": 100,
3112
- "effective": 100
3113
- },
3114
- "mmlu_high_school_physics": {
3115
- "original": 151,
3116
- "effective": 151
3117
- },
3118
- "mmlu_high_school_chemistry": {
3119
- "original": 203,
3120
- "effective": 203
3121
- }
3122
- },
3123
- "config": {
3124
- "model": "hf",
3125
- "model_args": "pretrained=/home/migel/Tess-v2.5-qwen2-72B-safetensors,parallelize=True",
3126
- "model_num_parameters": 72706203648,
3127
- "model_dtype": "torch.float16",
3128
- "model_revision": "main",
3129
- "model_sha": "",
3130
- "batch_size": "8",
3131
- "batch_sizes": [],
3132
- "device": null,
3133
- "use_cache": null,
3134
- "limit": null,
3135
- "bootstrap_iters": 100000,
3136
- "gen_kwargs": null,
3137
- "random_seed": 0,
3138
- "numpy_seed": 1234,
3139
- "torch_seed": 1234,
3140
- "fewshot_seed": 1234
3141
- },
3142
- "git_hash": "b3e4c49a",
3143
- "date": 1718167288.656124,
3144
- "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.31\n\nPython version: 3.10.14 (main, Apr 6 2024, 18:45:05) [GCC 9.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1050-azure-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\nGPU 2: NVIDIA A100 80GB PCIe\nGPU 3: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 530.30.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 48 bits physical, 48 bits virtual\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nNUMA node(s): 4\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7V13 64-Core Processor\nStepping: 1\nCPU MHz: 2445.435\nBogoMIPS: 4890.87\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB\nL1i cache: 3 MiB\nL2 cache: 48 MiB\nL3 cache: 384 MiB\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] magma-cuda117 2.6.1 1 pytorch\n[conda] mkl 2022.2.1 pypi_0 pypi\n[conda] mkl-include 2022.2.1 pypi_0 pypi\n[conda] numpy 1.24.4 pypi_0 pypi\n[conda] pytorch-lightning 1.9.5 pypi_0 pypi\n[conda] torch 2.0.1 pypi_0 pypi\n[conda] torch-nebula 0.16.10 pypi_0 pypi\n[conda] torch-ort 1.17.0 pypi_0 pypi\n[conda] torchaudio 2.0.2+cu117 pypi_0 pypi\n[conda] torchdata 0.6.1 pypi_0 pypi\n[conda] torchmetrics 1.2.0 pypi_0 pypi\n[conda] torchsnapshot 0.1.0 pypi_0 pypi\n[conda] torchvision 0.15.2+cu117 pypi_0 pypi\n[conda] triton 2.0.0 pypi_0 pypi",
3145
- "transformers_version": "4.41.1",
3146
- "upper_git_hash": null,
3147
- "task_hashes": {},
3148
- "model_source": "hf",
3149
- "model_name": "/home/migel/Tess-v2.5-qwen2-72B-safetensors",
3150
- "model_name_sanitized": "__home__migel__Tess-v2.5-qwen2-72B-safetensors",
3151
- "system_instruction": null,
3152
- "system_instruction_sha": null,
3153
- "chat_template": null,
3154
- "chat_template_sha": null,
3155
- "start_time": 380863.826540975,
3156
- "end_time": 388726.503174757,
3157
- "total_evaluation_time_seconds": "7862.676633781986"
3158
- }
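
The deleted file above is a results dump in the lm-evaluation-harness format. As a minimal sketch of how the "n-samples" and "config" sections of a file with this layout can be inspected (Python 3, standard library only; the local filename and the printed summary are assumptions for illustration, not part of the original upload):

import json

# Load a local copy of the results file (filename assumed for illustration).
with open("Tess-v2.5-Qwen2-72B-mmlu.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# "n-samples" maps each MMLU subtask to its original/effective example counts.
n_samples = data["n-samples"]
total = sum(entry["effective"] for entry in n_samples.values())
print(f"{len(n_samples)} subtasks, {total} evaluated examples in total")

# Five largest subtasks by effective sample count.
for task, entry in sorted(n_samples.items(), key=lambda kv: -kv[1]["effective"])[:5]:
    print(f"{task:45s} {entry['effective']:5d}")

# "config" records how the run was launched (model args, batch size, seeds).
cfg = data["config"]
print("model_args:", cfg["model_args"])
print("batch_size:", cfg["batch_size"], "| fewshot_seed:", cfg["fewshot_seed"])

The key names used here ("n-samples", "original", "effective", "config", "model_args", "batch_size", "fewshot_seed") are taken directly from the file contents shown in the diff above.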