ToluClassics
committed on
Commit
•
6a732ce
1
Parent(s):
4a7f4b8
add run files
Browse files
- app.py +1 -4
- evals/afrimmlu_direct/afrimmlu_direct_amh-Meta-Llama-3-8B-Instruct.json +23 -21
- evals/afrimmlu_direct/afrimmlu_direct_amh-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_eng-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_eng-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_ewe-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_ewe-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_fra-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_fra-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_hau-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_hau-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_ibo-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_ibo-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_kin-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_kin-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_lin-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_lin-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_lug-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_lug-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_orm-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_orm-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_sna-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_sna-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_sot-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_sot-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_swa-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_swa-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_twi-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_twi-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_wol-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_wol-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_xho-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_xho-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_yor-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_yor-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_zul-Meta-Llama-3-8B-Instruct.json +25 -0
- evals/afrimmlu_direct/afrimmlu_direct_zul-afriteva_v2_large_ayaft.json +25 -0
- evals/afrimmlu_translate/afrimmlu_translate_amh-Meta-Llama-3-8B-Instruct.json +0 -23
- evals/afrixnli_direct/afrixnli_direct_amh-Meta-Llama-3-8B-Instruct.json +0 -23
- evals/afrixnli_translate/afrixnli_translate_amh-Meta-Llama-3-8B-Instruct.json +0 -23
- pull_benchmark_data.py +69 -0
app.py
CHANGED
@@ -16,7 +16,7 @@ AFRIXNLI_TRANSLATE = "afrixnli_translate"
|
|
16 |
|
17 |
BENCHMARKS = [AFRIMMLU_DIRECT, AFRIMMLU_TRANSLATE, AFRIXNLI_DIRECT, AFRIXNLI_TRANSLATE]
|
18 |
|
19 |
-
METRICS = ["
|
20 |
|
21 |
LANGS = ['amh', 'eng', 'ewe', 'fra', 'hau', 'ibo', 'kin', 'lin', 'lug', 'orm', 'sna', 'sot', 'swa', 'twi', 'wol', 'xho', 'yor', 'zul']
|
22 |
|
@@ -66,8 +66,6 @@ def collect_results():
|
|
66 |
pretrained_models.add(pretrained)
|
67 |
|
68 |
for lang_task, perfs in results.items():
|
69 |
-
print(lang_task)
|
70 |
-
print(perfs)
|
71 |
lang_task = lang_task.split('_')
|
72 |
lang = lang_task[-1]
|
73 |
task = '_'.join(lang_task[:-1])
|
@@ -75,7 +73,6 @@ def collect_results():
|
|
75 |
assert task in BENCHMARKS
|
76 |
|
77 |
if lang and task:
|
78 |
-
print(BENCHMARKS.index(task))
|
79 |
metric = METRICS[BENCHMARKS.index(task)-1]
|
80 |
p = round(perfs[metric] * 100, 1)
|
81 |
performance_dict[(pretrained, lang)][task] = p
|
|
|
16 |
|
17 |
BENCHMARKS = [AFRIMMLU_DIRECT, AFRIMMLU_TRANSLATE, AFRIXNLI_DIRECT, AFRIXNLI_TRANSLATE]
|
18 |
|
19 |
+
METRICS = ["acc", "acc_stderr", "f1"]
|
20 |
|
21 |
LANGS = ['amh', 'eng', 'ewe', 'fra', 'hau', 'ibo', 'kin', 'lin', 'lug', 'orm', 'sna', 'sot', 'swa', 'twi', 'wol', 'xho', 'yor', 'zul']
|
22 |
|
|
|
66 |
pretrained_models.add(pretrained)
|
67 |
|
68 |
for lang_task, perfs in results.items():
|
|
|
|
|
69 |
lang_task = lang_task.split('_')
|
70 |
lang = lang_task[-1]
|
71 |
task = '_'.join(lang_task[:-1])
|
|
|
73 |
assert task in BENCHMARKS
|
74 |
|
75 |
if lang and task:
|
|
|
76 |
metric = METRICS[BENCHMARKS.index(task)-1]
|
77 |
p = round(perfs[metric] * 100, 1)
|
78 |
performance_dict[(pretrained, lang)][task] = p
|
evals/afrimmlu_direct/afrimmlu_direct_amh-Meta-Llama-3-8B-Instruct.json
CHANGED
@@ -1,23 +1,25 @@
|
|
1 |
{
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"afrimmlu_direct_amh": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf",
|
15 |
-
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_amh": {
|
4 |
+
"acc": 0.294,
|
5 |
+
"acc_stderr": 0.02039509548493662,
|
6 |
+
"f1": 0.2746897269174172,
|
7 |
+
"f1_stderr": "N/A"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_amh": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_amh-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_amh": {
|
4 |
+
"acc": 0.198,
|
5 |
+
"acc_stderr": 0.01783895896384723,
|
6 |
+
"f1": 0.17909485060936567,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_amh": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_eng-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_eng": {
|
4 |
+
"acc": 0.54,
|
5 |
+
"acc_stderr": 0.022311333245289663,
|
6 |
+
"f1": 0.5382451731126852,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_eng": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_eng-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_eng": {
|
4 |
+
"acc": 0.202,
|
5 |
+
"acc_stderr": 0.017973260031288258,
|
6 |
+
"f1": 0.1810217396082914,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_eng": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_ewe-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_ewe": {
|
4 |
+
"acc": 0.256,
|
5 |
+
"acc_stderr": 0.019536923574747615,
|
6 |
+
"f1": 0.25092528422459087,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_ewe": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_ewe-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_ewe": {
|
4 |
+
"acc": 0.216,
|
5 |
+
"acc_stderr": 0.018421909061411935,
|
6 |
+
"f1": 0.2090649866201732,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_ewe": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_fra-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_fra": {
|
4 |
+
"acc": 0.52,
|
5 |
+
"acc_stderr": 0.02236516042423133,
|
6 |
+
"f1": 0.5208080267558528,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_fra": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_fra-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_fra": {
|
4 |
+
"acc": 0.214,
|
5 |
+
"acc_stderr": 0.01835979750238705,
|
6 |
+
"f1": 0.19892885512438008,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_fra": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_hau-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_hau": {
|
4 |
+
"acc": 0.284,
|
5 |
+
"acc_stderr": 0.02018670369357086,
|
6 |
+
"f1": 0.28010976427568457,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_hau": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_hau-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_hau": {
|
4 |
+
"acc": 0.198,
|
5 |
+
"acc_stderr": 0.01783895896384723,
|
6 |
+
"f1": 0.18262451497980564,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_hau": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_ibo-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_ibo": {
|
4 |
+
"acc": 0.33,
|
5 |
+
"acc_stderr": 0.02104961216613481,
|
6 |
+
"f1": 0.3245851895177857,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_ibo": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_ibo-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_ibo": {
|
4 |
+
"acc": 0.216,
|
5 |
+
"acc_stderr": 0.018421909061411935,
|
6 |
+
"f1": 0.19962216343021852,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_ibo": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_kin-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_kin": {
|
4 |
+
"acc": 0.296,
|
5 |
+
"acc_stderr": 0.020435342091896132,
|
6 |
+
"f1": 0.28675065419514817,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_kin": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_kin-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_kin": {
|
4 |
+
"acc": 0.196,
|
5 |
+
"acc_stderr": 0.017770751227744862,
|
6 |
+
"f1": 0.1757764061374706,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_kin": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_lin-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_lin": {
|
4 |
+
"acc": 0.322,
|
5 |
+
"acc_stderr": 0.02091666833001988,
|
6 |
+
"f1": 0.32402662210255406,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_lin": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_lin-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_lin": {
|
4 |
+
"acc": 0.206,
|
5 |
+
"acc_stderr": 0.018104794037333578,
|
6 |
+
"f1": 0.1892508672233748,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_lin": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_lug-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_lug": {
|
4 |
+
"acc": 0.274,
|
5 |
+
"acc_stderr": 0.019966103540279462,
|
6 |
+
"f1": 0.26961388059171515,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_lug": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_lug-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_lug": {
|
4 |
+
"acc": 0.186,
|
5 |
+
"acc_stderr": 0.01741880678058393,
|
6 |
+
"f1": 0.1659062113821138,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_lug": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_orm-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_orm": {
|
4 |
+
"acc": 0.29,
|
5 |
+
"acc_stderr": 0.02031317923174519,
|
6 |
+
"f1": 0.27651312244521153,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_orm": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_orm-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_orm": {
|
4 |
+
"acc": 0.196,
|
5 |
+
"acc_stderr": 0.017770751227744862,
|
6 |
+
"f1": 0.1850540063965828,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_orm": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_sna-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_sna": {
|
4 |
+
"acc": 0.318,
|
5 |
+
"acc_stderr": 0.020847571620814003,
|
6 |
+
"f1": 0.3122451614597504,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_sna": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_sna-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_sna": {
|
4 |
+
"acc": 0.202,
|
5 |
+
"acc_stderr": 0.017973260031288272,
|
6 |
+
"f1": 0.18269228929246092,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_sna": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_sot-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_sot": {
|
4 |
+
"acc": 0.3,
|
5 |
+
"acc_stderr": 0.020514426225628057,
|
6 |
+
"f1": 0.2929658538402751,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_sot": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_sot-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_sot": {
|
4 |
+
"acc": 0.17,
|
5 |
+
"acc_stderr": 0.01681563353139343,
|
6 |
+
"f1": 0.14781723073525851,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_sot": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_swa-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_swa": {
|
4 |
+
"acc": 0.338,
|
5 |
+
"acc_stderr": 0.02117566569520941,
|
6 |
+
"f1": 0.32951382759209963,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_swa": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_swa-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_swa": {
|
4 |
+
"acc": 0.2,
|
5 |
+
"acc_stderr": 0.01790645924143387,
|
6 |
+
"f1": 0.1770651960776954,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_swa": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_twi-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_twi": {
|
4 |
+
"acc": 0.298,
|
5 |
+
"acc_stderr": 0.020475118092988964,
|
6 |
+
"f1": 0.2871954716168561,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_twi": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_twi-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_twi": {
|
4 |
+
"acc": 0.236,
|
5 |
+
"acc_stderr": 0.01900869962208472,
|
6 |
+
"f1": 0.22301842452462128,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_twi": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_wol-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_wol": {
|
4 |
+
"acc": 0.294,
|
5 |
+
"acc_stderr": 0.02039509548493661,
|
6 |
+
"f1": 0.28966157375341367,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_wol": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_wol-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_wol": {
|
4 |
+
"acc": 0.21,
|
5 |
+
"acc_stderr": 0.018233620865305916,
|
6 |
+
"f1": 0.19254414015593024,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_wol": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_xho-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_xho": {
|
4 |
+
"acc": 0.28,
|
5 |
+
"acc_stderr": 0.020099950647503237,
|
6 |
+
"f1": 0.2719845000284934,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_xho": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_xho-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_xho": {
|
4 |
+
"acc": 0.214,
|
5 |
+
"acc_stderr": 0.01835979750238703,
|
6 |
+
"f1": 0.1970288455340002,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_xho": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_yor-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_yor": {
|
4 |
+
"acc": 0.284,
|
5 |
+
"acc_stderr": 0.02018670369357086,
|
6 |
+
"f1": 0.2753758430338612,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_yor": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_yor-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_yor": {
|
4 |
+
"acc": 0.21,
|
5 |
+
"acc_stderr": 0.018233620865305916,
|
6 |
+
"f1": 0.1953916524581152,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_yor": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_zul-Meta-Llama-3-8B-Instruct.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_zul": {
|
4 |
+
"acc": 0.288,
|
5 |
+
"acc_stderr": 0.020271503835075217,
|
6 |
+
"f1": 0.2879464935505291,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_zul": 1.0,
|
12 |
+
"wandb_run_name": "hopeful-vortex-13"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.bfloat16",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_direct/afrimmlu_direct_zul-afriteva_v2_large_ayaft.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"afrimmlu_direct_zul": {
|
4 |
+
"acc": 0.21,
|
5 |
+
"acc_stderr": 0.018233620865305916,
|
6 |
+
"f1": 0.1859078146090128,
|
7 |
+
"f1_stderr": "N/A"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"afrimmlu_direct_zul": 1.0,
|
12 |
+
"wandb_run_name": "cool-river-12"
|
13 |
+
},
|
14 |
+
"config": {
|
15 |
+
"model": "hf",
|
16 |
+
"model_args": "pretrained=taresco/afriteva_v2_large_ayaft",
|
17 |
+
"batch_size": 8,
|
18 |
+
"device": null,
|
19 |
+
"model_dtype": "torch.float32",
|
20 |
+
"numpy_seed": 42,
|
21 |
+
"torch_seed": 42,
|
22 |
+
"random_seed": 42,
|
23 |
+
"fewshot_seed": 42
|
24 |
+
}
|
25 |
+
}
|
evals/afrimmlu_translate/afrimmlu_translate_amh-Meta-Llama-3-8B-Instruct.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"afrimmlu_translate_amh": {
|
4 |
-
"acc": 0.2634730538922156,
|
5 |
-
"acc_stderr": 0.012889646336321774,
|
6 |
-
"acc_norm": 0.31394354148845166,
|
7 |
-
"acc_norm_stderr": 0.013579515768185788
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"afrimmlu_translate_amh": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf",
|
15 |
-
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/afrixnli_direct/afrixnli_direct_amh-Meta-Llama-3-8B-Instruct.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"afrixnli_direct_amh": {
|
4 |
-
"acc": 0.2634730538922156,
|
5 |
-
"acc_stderr": 0.012889646336321774,
|
6 |
-
"acc_norm": 0.31394354148845166,
|
7 |
-
"acc_norm_stderr": 0.013579515768185788
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"afrixnli_direct_amh": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf",
|
15 |
-
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/afrixnli_translate/afrixnli_translate_amh-Meta-Llama-3-8B-Instruct.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"afrixnli_translate_amh": {
|
4 |
-
"acc": 0.2634730538922156,
|
5 |
-
"acc_stderr": 0.012889646336321774,
|
6 |
-
"acc_norm": 0.31394354148845166,
|
7 |
-
"acc_norm_stderr": 0.013579515768185788
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"afrixnli_translate_amh": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf",
|
15 |
-
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B-Instruct",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pull_benchmark_data.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import wandb
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
# Directory that contains this script; exported JSON files are written
# beneath it (see the path built in main()).
curr_dir = os.path.dirname(os.path.realpath(__file__))

# Language codes for which per-language metrics are read from each run.
LANGUAGES = [
    'amh', 'eng', 'ewe', 'fra', 'hau', 'ibo', 'kin', 'lin', 'lug',
    'orm', 'sna', 'sot', 'swa', 'twi', 'wol', 'xho', 'yor', 'zul',
]

# Maps a benchmark name (the value given to --benchmark) to the W&B project
# path whose runs are exported.
BENCHMARK2PROJECT = {
    "afrimmlu_direct": "african-research-collective/llm-evaluation-afrimmlu-direct",
}

# The mapping was originally published under this misspelled name; keep it as
# an alias of the same dict so existing references continue to work.
BENCHMARCK2PROJECT = BENCHMARK2PROJECT

# Version number written under "versions" for every exported entry.
BENCHMARK_VERSION = 1.0
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
def _pretrained_model_name(model_args):
    """Return the short model name found in a run's ``model_args`` string.

    ``model_args`` is a comma-separated list of ``key=value`` pairs such as
    ``"pretrained=meta-llama/Meta-Llama-3-8B-Instruct"``.  The ``pretrained``
    entry is used wherever it appears in the list; when there is none, the
    first pair is used.  Only the last ``/``-separated component of the value
    is returned (``"Meta-Llama-3-8B-Instruct"`` for the example above).

    Raises:
        IndexError: if the selected pair contains no ``=``.
    """
    pairs = [pair.strip() for pair in model_args.split(',')]
    selected = next(
        (pair for pair in pairs if pair.startswith('pretrained=')),
        pairs[0],
    )
    return selected.split('=', 1)[1].split('/')[-1]


def _export_config(cli_configs):
    """Copy the exported subset of a run's ``cli_configs`` into a new dict.

    Raises:
        KeyError: if any of the exported keys is absent from ``cli_configs``.
    """
    exported_keys = (
        'model', 'model_args', 'batch_size', 'device', 'model_dtype',
        'numpy_seed', 'torch_seed', 'random_seed', 'fewshot_seed',
    )
    return {key: cli_configs[key] for key in exported_keys}


def main(args):
    """Export the per-language results of every W&B run of a benchmark.

    For each run in the W&B project mapped to ``args.benchmark`` and each
    language in ``LANGUAGES``, one JSON file with ``results``, ``versions``
    and ``config`` sections is written to
    ``<curr_dir>/evals/<benchmark>/<benchmark>_<lang>-<model>.json``.

    Runs without ``cli_configs`` and run/language pairs without the expected
    summary metrics are reported on stdout and skipped, so one incomplete run
    does not abort the whole export.

    Args:
        args: parsed command-line arguments; only ``args.benchmark`` is read.

    Raises:
        KeyError: if ``args.benchmark`` is not a key of ``BENCHMARCK2PROJECT``.
    """
    api = wandb.Api()
    runs = api.runs(BENCHMARCK2PROJECT[args.benchmark])

    # Create the destination up front: open(..., 'w') does not create missing
    # parent directories, which would fail for a benchmark exported for the
    # first time.
    output_dir = os.path.join(curr_dir, 'evals', args.benchmark)
    os.makedirs(output_dir, exist_ok=True)

    metric_names = ('acc', 'acc_stderr', 'f1', 'f1_stderr')

    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        # We call ._json_dict to omit large files.
        summary = run.summary._json_dict

        # The config and model name do not depend on the language, so they
        # are computed once per run.
        try:
            config = _export_config(run.config['cli_configs'])
        except KeyError as missing:
            print(f'Skipping run {run.name}: missing config key {missing}')
            continue
        pretrained_model = _pretrained_model_name(config['model_args'])

        for lang in LANGUAGES:
            lang_result_key = f'{args.benchmark}_{lang}'

            try:
                metrics = {
                    name: summary[f'{lang_result_key}/{name}']
                    for name in metric_names
                }
            except KeyError as missing:
                print(
                    f'Skipping {lang_result_key} for run {run.name}: '
                    f'missing summary key {missing}'
                )
                continue

            final_json_object = {
                'results': {lang_result_key: metrics},
                'versions': {
                    lang_result_key: BENCHMARK_VERSION,
                    'wandb_run_name': run.name,
                },
                'config': config,
            }

            output_path = os.path.join(
                output_dir,
                f'{args.benchmark}_{lang}-{pretrained_model}.json',
            )
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(final_json_object, f, indent=2)
|
63 |
+
|
64 |
+
|
65 |
+
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Export W&B benchmark results to per-language JSON files.'
    )
    # Restrict the value to the known benchmarks so that an unknown name is
    # rejected by argparse with a usage message instead of surfacing later as
    # a KeyError traceback from main().
    parser.add_argument(
        '--benchmark',
        type=str,
        required=True,
        choices=sorted(BENCHMARCK2PROJECT),
        help='Benchmark whose W&B runs should be exported.',
    )
    args = parser.parse_args()
    main(args)
|