ToluClassics committed
Commit 6a732ce
1 Parent(s): 4a7f4b8

add run files

Files changed (41)
  1. app.py +1 -4
  2. evals/afrimmlu_direct/afrimmlu_direct_amh-Meta-Llama-3-8B-Instruct.json +23 -21
  3. evals/afrimmlu_direct/afrimmlu_direct_amh-afriteva_v2_large_ayaft.json +25 -0
  4. evals/afrimmlu_direct/afrimmlu_direct_eng-Meta-Llama-3-8B-Instruct.json +25 -0
  5. evals/afrimmlu_direct/afrimmlu_direct_eng-afriteva_v2_large_ayaft.json +25 -0
  6. evals/afrimmlu_direct/afrimmlu_direct_ewe-Meta-Llama-3-8B-Instruct.json +25 -0
  7. evals/afrimmlu_direct/afrimmlu_direct_ewe-afriteva_v2_large_ayaft.json +25 -0
  8. evals/afrimmlu_direct/afrimmlu_direct_fra-Meta-Llama-3-8B-Instruct.json +25 -0
  9. evals/afrimmlu_direct/afrimmlu_direct_fra-afriteva_v2_large_ayaft.json +25 -0
  10. evals/afrimmlu_direct/afrimmlu_direct_hau-Meta-Llama-3-8B-Instruct.json +25 -0
  11. evals/afrimmlu_direct/afrimmlu_direct_hau-afriteva_v2_large_ayaft.json +25 -0
  12. evals/afrimmlu_direct/afrimmlu_direct_ibo-Meta-Llama-3-8B-Instruct.json +25 -0
  13. evals/afrimmlu_direct/afrimmlu_direct_ibo-afriteva_v2_large_ayaft.json +25 -0
  14. evals/afrimmlu_direct/afrimmlu_direct_kin-Meta-Llama-3-8B-Instruct.json +25 -0
  15. evals/afrimmlu_direct/afrimmlu_direct_kin-afriteva_v2_large_ayaft.json +25 -0
  16. evals/afrimmlu_direct/afrimmlu_direct_lin-Meta-Llama-3-8B-Instruct.json +25 -0
  17. evals/afrimmlu_direct/afrimmlu_direct_lin-afriteva_v2_large_ayaft.json +25 -0
  18. evals/afrimmlu_direct/afrimmlu_direct_lug-Meta-Llama-3-8B-Instruct.json +25 -0
  19. evals/afrimmlu_direct/afrimmlu_direct_lug-afriteva_v2_large_ayaft.json +25 -0
  20. evals/afrimmlu_direct/afrimmlu_direct_orm-Meta-Llama-3-8B-Instruct.json +25 -0
  21. evals/afrimmlu_direct/afrimmlu_direct_orm-afriteva_v2_large_ayaft.json +25 -0
  22. evals/afrimmlu_direct/afrimmlu_direct_sna-Meta-Llama-3-8B-Instruct.json +25 -0
  23. evals/afrimmlu_direct/afrimmlu_direct_sna-afriteva_v2_large_ayaft.json +25 -0
  24. evals/afrimmlu_direct/afrimmlu_direct_sot-Meta-Llama-3-8B-Instruct.json +25 -0
  25. evals/afrimmlu_direct/afrimmlu_direct_sot-afriteva_v2_large_ayaft.json +25 -0
  26. evals/afrimmlu_direct/afrimmlu_direct_swa-Meta-Llama-3-8B-Instruct.json +25 -0
  27. evals/afrimmlu_direct/afrimmlu_direct_swa-afriteva_v2_large_ayaft.json +25 -0
  28. evals/afrimmlu_direct/afrimmlu_direct_twi-Meta-Llama-3-8B-Instruct.json +25 -0
  29. evals/afrimmlu_direct/afrimmlu_direct_twi-afriteva_v2_large_ayaft.json +25 -0
  30. evals/afrimmlu_direct/afrimmlu_direct_wol-Meta-Llama-3-8B-Instruct.json +25 -0
  31. evals/afrimmlu_direct/afrimmlu_direct_wol-afriteva_v2_large_ayaft.json +25 -0
  32. evals/afrimmlu_direct/afrimmlu_direct_xho-Meta-Llama-3-8B-Instruct.json +25 -0
  33. evals/afrimmlu_direct/afrimmlu_direct_xho-afriteva_v2_large_ayaft.json +25 -0
  34. evals/afrimmlu_direct/afrimmlu_direct_yor-Meta-Llama-3-8B-Instruct.json +25 -0
  35. evals/afrimmlu_direct/afrimmlu_direct_yor-afriteva_v2_large_ayaft.json +25 -0
  36. evals/afrimmlu_direct/afrimmlu_direct_zul-Meta-Llama-3-8B-Instruct.json +25 -0
  37. evals/afrimmlu_direct/afrimmlu_direct_zul-afriteva_v2_large_ayaft.json +25 -0
  38. evals/afrimmlu_translate/afrimmlu_translate_amh-Meta-Llama-3-8B-Instruct.json +0 -23
  39. evals/afrixnli_direct/afrixnli_direct_amh-Meta-Llama-3-8B-Instruct.json +0 -23
  40. evals/afrixnli_translate/afrixnli_translate_amh-Meta-Llama-3-8B-Instruct.json +0 -23
  41. pull_benchmark_data.py +69 -0
app.py CHANGED
@@ -16,7 +16,7 @@ AFRIXNLI_TRANSLATE = "afrixnli_translate"
 
 BENCHMARKS = [AFRIMMLU_DIRECT, AFRIMMLU_TRANSLATE, AFRIXNLI_DIRECT, AFRIXNLI_TRANSLATE]
 
-METRICS = ["acc_norm", "acc_norm", "acc_norm"]
+METRICS = ["acc", "acc_stderr", "f1"]
 
 LANGS = ['amh', 'eng', 'ewe', 'fra', 'hau', 'ibo', 'kin', 'lin', 'lug', 'orm', 'sna', 'sot', 'swa', 'twi', 'wol', 'xho', 'yor', 'zul']
 
@@ -66,8 +66,6 @@ def collect_results():
         pretrained_models.add(pretrained)
 
         for lang_task, perfs in results.items():
-            print(lang_task)
-            print(perfs)
             lang_task = lang_task.split('_')
             lang = lang_task[-1]
             task = '_'.join(lang_task[:-1])
@@ -75,7 +73,6 @@ def collect_results():
             assert task in BENCHMARKS
 
             if lang and task:
-                print(BENCHMARKS.index(task))
                 metric = METRICS[BENCHMARKS.index(task)-1]
                 p = round(perfs[metric] * 100, 1)
                 performance_dict[(pretrained, lang)][task] = p
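Note on the new metric lookup: METRICS now holds three entries while BENCHMARKS lists four tasks, and metric = METRICS[BENCHMARKS.index(task)-1] selects by position, so for the first benchmark (index 0) the -1 wraps around to the last METRICS entry. A minimal sketch of an explicit task-to-metric mapping is shown below; which metric belongs to which benchmark is an assumption made for illustration, not part of this commit.

# Hypothetical alternative to the positional METRICS lookup in collect_results().
# The metric assigned to each benchmark here is an illustrative assumption.
TASK_TO_METRIC = {
    "afrimmlu_direct": "f1",
    "afrimmlu_translate": "f1",
    "afrixnli_direct": "acc",
    "afrixnli_translate": "acc",
}

def display_score(task: str, perfs: dict) -> float:
    """Return the rounded percentage score for one task's result dict."""
    metric = TASK_TO_METRIC[task]
    return round(perfs[metric] * 100, 1)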
evals/afrimmlu_direct/afrimmlu_direct_amh-Meta-Llama-3-8B-Instruct.json CHANGED
@@ -1,23 +1,25 @@
 {
   "results": {
     "afrimmlu_direct_amh": {
-      "acc": 0.2634730538922156,
-      "acc_stderr": 0.012889646336321774,
-      "acc_norm": 0.31394354148845166,
-      "acc_norm_stderr": 0.013579515768185788
+      "acc": 0.294,
+      "acc_stderr": 0.02039509548493662,
+      "f1": 0.2746897269174172,
+      "f1_stderr": "N/A"
     }
   },
   "versions": {
-    "afrimmlu_direct_amh": 0
+    "afrimmlu_direct_amh": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
   },
   "config": {
     "model": "hf",
     "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
     "batch_size": 8,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
   }
 }
evals/afrimmlu_direct/afrimmlu_direct_amh-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_amh": {
+      "acc": 0.198,
+      "acc_stderr": 0.01783895896384723,
+      "f1": 0.17909485060936567,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_amh": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_eng-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_eng": {
+      "acc": 0.54,
+      "acc_stderr": 0.022311333245289663,
+      "f1": 0.5382451731126852,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_eng": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_eng-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_eng": {
+      "acc": 0.202,
+      "acc_stderr": 0.017973260031288258,
+      "f1": 0.1810217396082914,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_eng": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_ewe-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_ewe": {
+      "acc": 0.256,
+      "acc_stderr": 0.019536923574747615,
+      "f1": 0.25092528422459087,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_ewe": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_ewe-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_ewe": {
+      "acc": 0.216,
+      "acc_stderr": 0.018421909061411935,
+      "f1": 0.2090649866201732,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_ewe": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_fra-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_fra": {
+      "acc": 0.52,
+      "acc_stderr": 0.02236516042423133,
+      "f1": 0.5208080267558528,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_fra": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_fra-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_fra": {
+      "acc": 0.214,
+      "acc_stderr": 0.01835979750238705,
+      "f1": 0.19892885512438008,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_fra": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_hau-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_hau": {
+      "acc": 0.284,
+      "acc_stderr": 0.02018670369357086,
+      "f1": 0.28010976427568457,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_hau": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_hau-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_hau": {
+      "acc": 0.198,
+      "acc_stderr": 0.01783895896384723,
+      "f1": 0.18262451497980564,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_hau": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_ibo-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_ibo": {
+      "acc": 0.33,
+      "acc_stderr": 0.02104961216613481,
+      "f1": 0.3245851895177857,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_ibo": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_ibo-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_ibo": {
+      "acc": 0.216,
+      "acc_stderr": 0.018421909061411935,
+      "f1": 0.19962216343021852,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_ibo": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_kin-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_kin": {
+      "acc": 0.296,
+      "acc_stderr": 0.020435342091896132,
+      "f1": 0.28675065419514817,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_kin": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_kin-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_kin": {
+      "acc": 0.196,
+      "acc_stderr": 0.017770751227744862,
+      "f1": 0.1757764061374706,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_kin": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_lin-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_lin": {
+      "acc": 0.322,
+      "acc_stderr": 0.02091666833001988,
+      "f1": 0.32402662210255406,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_lin": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_lin-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_lin": {
+      "acc": 0.206,
+      "acc_stderr": 0.018104794037333578,
+      "f1": 0.1892508672233748,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_lin": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_lug-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_lug": {
+      "acc": 0.274,
+      "acc_stderr": 0.019966103540279462,
+      "f1": 0.26961388059171515,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_lug": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_lug-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_lug": {
+      "acc": 0.186,
+      "acc_stderr": 0.01741880678058393,
+      "f1": 0.1659062113821138,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_lug": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_orm-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_orm": {
+      "acc": 0.29,
+      "acc_stderr": 0.02031317923174519,
+      "f1": 0.27651312244521153,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_orm": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_orm-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_orm": {
+      "acc": 0.196,
+      "acc_stderr": 0.017770751227744862,
+      "f1": 0.1850540063965828,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_orm": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_sna-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_sna": {
+      "acc": 0.318,
+      "acc_stderr": 0.020847571620814003,
+      "f1": 0.3122451614597504,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_sna": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_sna-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_sna": {
+      "acc": 0.202,
+      "acc_stderr": 0.017973260031288272,
+      "f1": 0.18269228929246092,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_sna": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_sot-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_sot": {
+      "acc": 0.3,
+      "acc_stderr": 0.020514426225628057,
+      "f1": 0.2929658538402751,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_sot": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_sot-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_sot": {
+      "acc": 0.17,
+      "acc_stderr": 0.01681563353139343,
+      "f1": 0.14781723073525851,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_sot": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_swa-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_swa": {
+      "acc": 0.338,
+      "acc_stderr": 0.02117566569520941,
+      "f1": 0.32951382759209963,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_swa": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_swa-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_swa": {
+      "acc": 0.2,
+      "acc_stderr": 0.01790645924143387,
+      "f1": 0.1770651960776954,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_swa": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_twi-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_twi": {
+      "acc": 0.298,
+      "acc_stderr": 0.020475118092988964,
+      "f1": 0.2871954716168561,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_twi": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_twi-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_twi": {
+      "acc": 0.236,
+      "acc_stderr": 0.01900869962208472,
+      "f1": 0.22301842452462128,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_twi": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_wol-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_wol": {
+      "acc": 0.294,
+      "acc_stderr": 0.02039509548493661,
+      "f1": 0.28966157375341367,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_wol": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_wol-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_wol": {
+      "acc": 0.21,
+      "acc_stderr": 0.018233620865305916,
+      "f1": 0.19254414015593024,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_wol": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_xho-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_xho": {
+      "acc": 0.28,
+      "acc_stderr": 0.020099950647503237,
+      "f1": 0.2719845000284934,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_xho": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_xho-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_xho": {
+      "acc": 0.214,
+      "acc_stderr": 0.01835979750238703,
+      "f1": 0.1970288455340002,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_xho": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_yor-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_yor": {
+      "acc": 0.284,
+      "acc_stderr": 0.02018670369357086,
+      "f1": 0.2753758430338612,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_yor": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_yor-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_yor": {
+      "acc": 0.21,
+      "acc_stderr": 0.018233620865305916,
+      "f1": 0.1953916524581152,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_yor": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_zul-Meta-Llama-3-8B-Instruct.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_zul": {
+      "acc": 0.288,
+      "acc_stderr": 0.020271503835075217,
+      "f1": 0.2879464935505291,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_zul": 1.0,
+    "wandb_run_name": "hopeful-vortex-13"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.bfloat16",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_direct/afrimmlu_direct_zul-afriteva_v2_large_ayaft.json ADDED
@@ -0,0 +1,25 @@
+{
+  "results": {
+    "afrimmlu_direct_zul": {
+      "acc": 0.21,
+      "acc_stderr": 0.018233620865305916,
+      "f1": 0.1859078146090128,
+      "f1_stderr": "N/A"
+    }
+  },
+  "versions": {
+    "afrimmlu_direct_zul": 1.0,
+    "wandb_run_name": "cool-river-12"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
+    "batch_size": 8,
+    "device": null,
+    "model_dtype": "torch.float32",
+    "numpy_seed": 42,
+    "torch_seed": 42,
+    "random_seed": 42,
+    "fewshot_seed": 42
+  }
+}
evals/afrimmlu_translate/afrimmlu_translate_amh-Meta-Llama-3-8B-Instruct.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "afrimmlu_translate_amh": {
4
- "acc": 0.2634730538922156,
5
- "acc_stderr": 0.012889646336321774,
6
- "acc_norm": 0.31394354148845166,
7
- "acc_norm_stderr": 0.013579515768185788
8
- }
9
- },
10
- "versions": {
11
- "afrimmlu_translate_amh": 0
12
- },
13
- "config": {
14
- "model": "hf",
15
- "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/afrixnli_direct/afrixnli_direct_amh-Meta-Llama-3-8B-Instruct.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "afrixnli_direct_amh": {
4
- "acc": 0.2634730538922156,
5
- "acc_stderr": 0.012889646336321774,
6
- "acc_norm": 0.31394354148845166,
7
- "acc_norm_stderr": 0.013579515768185788
8
- }
9
- },
10
- "versions": {
11
- "afrixnli_direct_amh": 0
12
- },
13
- "config": {
14
- "model": "hf",
15
- "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/afrixnli_translate/afrixnli_translate_amh-Meta-Llama-3-8B-Instruct.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "afrixnli_translate_amh": {
4
- "acc": 0.2634730538922156,
5
- "acc_stderr": 0.012889646336321774,
6
- "acc_norm": 0.31394354148845166,
7
- "acc_norm_stderr": 0.013579515768185788
8
- }
9
- },
10
- "versions": {
11
- "afrixnli_translate_amh": 0
12
- },
13
- "config": {
14
- "model": "hf",
15
- "model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pull_benchmark_data.py ADDED
@@ -0,0 +1,69 @@
+import os
+import json
+import wandb
+import argparse
+
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+
+LANGUAGES = ['amh', 'eng', 'ewe', 'fra', 'hau', 'ibo', 'kin', 'lin', 'lug', 'orm', 'sna', 'sot', 'swa', 'twi', 'wol', 'xho', 'yor', 'zul']
+BENCHMARCK2PROJECT = {
+    "afrimmlu_direct" : "african-research-collective/llm-evaluation-afrimmlu-direct"
+}
+BENCHMARK_VERSION = 1.0
+
+
+
+def main(args):
+    api = wandb.Api()
+
+    runs = api.runs(BENCHMARCK2PROJECT[args.benchmark])
+
+    print(runs)
+
+    for run in runs:
+        # .summary contains the output keys/values for metrics like accuracy.
+        # We call ._json_dict to omit large files
+
+        for lang in LANGUAGES:
+            lang_result_key = f'{args.benchmark}_{lang}'
+
+            results = {lang_result_key: {}}
+            config = {}
+            versions = {}
+
+
+            results[lang_result_key]['acc'] = run.summary._json_dict[f'{lang_result_key}/acc']
+            results[lang_result_key]['acc_stderr'] = run.summary._json_dict[f'{lang_result_key}/acc_stderr']
+            results[lang_result_key]['f1'] = run.summary._json_dict[f'{lang_result_key}/f1']
+            results[lang_result_key]['f1_stderr'] = run.summary._json_dict[f'{lang_result_key}/f1_stderr']
+
+            versions[lang_result_key] = BENCHMARK_VERSION
+            versions['wandb_run_name'] = run.name
+
+            config['model'] = run.config['cli_configs']['model']
+            config['model_args'] = run.config['cli_configs']['model_args']
+            config['batch_size'] = run.config['cli_configs']['batch_size']
+            config['device'] = run.config['cli_configs']['device']
+            config['model_dtype'] = run.config['cli_configs']['model_dtype']
+            config['numpy_seed'] = run.config['cli_configs']['numpy_seed']
+            config['torch_seed'] = run.config['cli_configs']['torch_seed']
+            config['random_seed'] = run.config['cli_configs']['random_seed']
+            config['fewshot_seed'] = run.config['cli_configs']['fewshot_seed']
+
+            final_json_object = {
+                'results': results,
+                'versions': versions,
+                'config': config
+            }
+
+            pretrained_model = config['model_args'].split(',')[0].split('=')[1].split('/')[-1]
+
+            with open(os.path.join(curr_dir, f"evals/{args.benchmark}/{args.benchmark}_{lang}-{pretrained_model}.json"), 'w') as f:
+                json.dump(final_json_object, f, indent=2)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--benchmark', type=str, required=True)
+    args = parser.parse_args()
+    main(args)
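As a usage sketch, assuming wandb is already authenticated and the evals/<benchmark>/ output directory exists, the script is invoked with a benchmark key from BENCHMARCK2PROJECT, for example: python pull_benchmark_data.py --benchmark afrimmlu_direct. The snippet below shows one way the dumped result files could be read back; the reader function is illustrative and not part of this commit.

# Illustrative reader for a result file written by pull_benchmark_data.py.
import json

def read_eval_file(path: str):
    """Load one result file and return its task key, accuracy, and F1."""
    with open(path) as f:
        data = json.load(f)
    # Each file holds a single "<benchmark>_<lang>" entry under "results".
    (task_key, metrics), = data["results"].items()
    return task_key, metrics["acc"], metrics["f1"]

if __name__ == "__main__":
    key, acc, f1 = read_eval_file(
        "evals/afrimmlu_direct/afrimmlu_direct_amh-Meta-Llama-3-8B-Instruct.json"
    )
    print(key, acc, f1)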