Clémentine committed · Commit 47c3ae2 · 1 parent: 9185c5b

update with repo links

Files changed:
- app.py: +8 -4
- content.py: +6 -11
app.py
CHANGED
@@ -17,8 +17,11 @@ from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CHANGE
 
 BALM_TOKEN = os.environ.get("BALM_TOKEN", None)
 
-OWNER="
+OWNER="gaia-benchmark"
+DATA_DATASET = f"{OWNER}/GAIA"
 SUBMISSION_DATASET = f"{OWNER}/submissions"
+RESULTS_DATASET = f"{OWNER}/results"
+LEADERBOARD_PATH = f"{OWNER}/leaderboard"
 
 SPLIT="validation" #Change to test once we are ready to go
 api = HfApi()
@@ -28,7 +31,8 @@ os.makedirs("scored", exist_ok=True)
 # Display the results
 eval_results = {}
 for level in range(1, 4):
-    eval_results[level] = load_dataset(f"{
+    eval_results[level] = load_dataset(RESULTS_DATASET, f"2023_level{level}", use_auth_token=BALM_TOKEN, split=SPLIT)
+
 
 eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
 eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
@@ -37,12 +41,12 @@ eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))
 # Gold answers
 gold_results = {}
 for level in range(1, 4):
-    level_dataset = load_dataset(
+    level_dataset = load_dataset(DATA_DATASET, f"2023_level{level}", split=SPLIT, use_auth_token=BALM_TOKEN)
     gold_results[level] = {row["task_id"]: row["ground_truth"] for row in level_dataset}
 
 
 def restart_space():
-    api.restart_space(repo_id=
+    api.restart_space(repo_id=LEADERBOARD_PATH, token=BALM_TOKEN)
 
 
 COLS = ["Model", "Score ⬆️", "Organisation"]
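Taken together, the app.py side of this commit routes every repo reference through the new OWNER constant. Below is a minimal, self-contained sketch of the resulting pattern, assembled from the lines visible in the diff; the repos are private, so a real token is needed, and the final print loop is only illustrative. Note that use_auth_token was the parameter name accepted by the datasets library at the time (newer releases call it token).

import os

from datasets import load_dataset
from huggingface_hub import HfApi

BALM_TOKEN = os.environ.get("BALM_TOKEN", None)

OWNER = "gaia-benchmark"
DATA_DATASET = f"{OWNER}/GAIA"             # gold tasks with ground-truth answers
RESULTS_DATASET = f"{OWNER}/results"       # submitted model results
LEADERBOARD_PATH = f"{OWNER}/leaderboard"  # the leaderboard Space itself
SPLIT = "validation"  # change to "test" for the private set

api = HfApi()

for level in range(1, 4):
    # Each difficulty level is a separate dataset config, e.g. "2023_level1".
    results = load_dataset(RESULTS_DATASET, f"2023_level{level}",
                           use_auth_token=BALM_TOKEN, split=SPLIT)
    gold = load_dataset(DATA_DATASET, f"2023_level{level}",
                        split=SPLIT, use_auth_token=BALM_TOKEN)
    # Gold answers are keyed by task_id so submissions can be scored against them.
    gold_map = {row["task_id"]: row["ground_truth"] for row in gold}
    print(f"level {level}: {len(results)} results, {len(gold_map)} gold answers")

def restart_space():
    # Reboot the leaderboard Space so it reloads the latest results on startup.
    api.restart_space(repo_id=LEADERBOARD_PATH, token=BALM_TOKEN)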
content.py
CHANGED
@@ -1,9 +1,4 @@
-
-## [2023-06-02]
-- Beta internal version of the leaderboard
-"""
-
-TITLE = """<h1 align="center" id="space-title">BALM Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">GAIA Leaderboard</h1>"""
 
 CANARY_STRING = "" # TODO
 
@@ -11,9 +6,9 @@ INTRODUCTION_TEXT = f"""
 Large language models have seen their potential capabilities increased by several orders of magnitude with the introduction of augmentations, from simple prompting adjustment to actual external tooling (calculators, vision models, ...) or online web retrieval.
 
 To evaluate the next generation of LLMs, we argue for a new kind of benchmark, simple and yet effective to measure actual progress on augmented capabilities.
-We therefore present
+We therefore present GAIA.
 
-
+GAIA is made of 3 evaluation levels, depending on the added level of tooling and autonomy the model needs.
 We expect the level 1 to be breakable by very good LLMs, and the level 3 to indicate a strong jump in model capabilities.
 
 Each of these levels is divided into two sets: a public dev set, on which people can self report their results, and a private test set, which will be unlocked once public performance passes a threshold on the dev set.
@@ -22,12 +17,12 @@ Please do not repost the public dev set, nor use it in training data for your mo
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""@misc{
+CITATION_BUTTON_TEXT = r"""@misc{gaia, # TODO
     author = {tbd},
-    title = {
+    title = {General AI Assistant benchmark},
     year = {2023},
     #publisher = {Hugging Face},
-    #howpublished = "\url{https://huggingface.co/spaces/
+    #howpublished = "\url{https://huggingface.co/spaces/gaia-benchmark/}"
 }"""
 
 
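For context, the strings defined in content.py are the ones app.py imports for the Space's UI. A hypothetical sketch of that wiring, assuming a typical Gradio leaderboard layout (the actual layout code in app.py is not part of this diff):

import gradio as gr

from content import (TITLE, INTRODUCTION_TEXT,
                     CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT)

with gr.Blocks() as demo:
    gr.HTML(TITLE)                  # the raw <h1> heading
    gr.Markdown(INTRODUCTION_TEXT)  # benchmark description shown above the tables
    with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
        # Read-only textbox so visitors can copy the BibTeX entry.
        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=7)

demo.launch()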