Clémentine committed · commit 5f9d165 · Parent(s): d0c2655

small visu fixes

Files changed:
- app.py +40 -17
- content.py +16 -7
app.py
CHANGED
@@ -13,7 +13,7 @@ from huggingface_hub import HfApi
 
 # InfoStrings
 from scorer import question_scorer
-from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
+from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
 
 TOKEN = os.environ.get("TOKEN", None)
 
@@ -21,7 +21,7 @@ OWNER="gaia-benchmark"
 DATA_DATASET = f"{OWNER}/GAIA"
 INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
 SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
-RESULTS_DATASET = f"{OWNER}/
+RESULTS_DATASET = f"{OWNER}/results_public"
 LEADERBOARD_PATH = f"{OWNER}/leaderboard"
 api = HfApi()
 
@@ -30,27 +30,40 @@ YEAR_VERSION = "2023"
 os.makedirs("scored", exist_ok=True)
 
 # Display the results
-eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION,
-
-
+eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload")
+def get_dataframe_from_results(eval_results, split):
+    local_df = eval_results[split]
+    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
+    local_df = local_df.remove_columns(["mail", "system_prompt", "url"])
+    local_df = local_df.rename_column("model", "Model name")
+    local_df = local_df.rename_column("model_family", "Model family")
+    local_df = local_df.rename_column("score", "Average score (%)")
+    for i in [1, 2, 3]:
+        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
+    df = pd.DataFrame(local_df)
+    return df
+
+eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
+eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
 
 # Gold answers
 gold_results = {}
-gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all",
+gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", token=TOKEN)
 gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}
 
 
 def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
-
-COLS = ["Model", "Score ⬆️", "Organisation"]
-TYPES = ["str", "number", "str",]
+TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]
 
 def add_new_eval(
     val_or_test: str,
     model: str,
-
+    model_family: str,
+    system_prompt: str,
+    url: str,
+    path_to_file: str,
     organisation: str,
     mail: str,
 ):
@@ -120,6 +133,9 @@ def add_new_eval(
     # Actual submission
     eval_entry = {
        "model": model,
+        "model_family": model_family,
+        "system_prompt": system_prompt,
+        "url": url,
        "organisation": organisation,
        "mail": mail,
        "score": scores["all"]/num_questions["all"],
@@ -131,13 +147,13 @@ def add_new_eval(
     print(eval_results)
     eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
 
-    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait
+    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
 
 
 def refresh():
-    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION,
-    eval_dataframe_val =
-    eval_dataframe_test =
+    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload")
+    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
+    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
     return eval_dataframe_val, eval_dataframe_test
 
 def upload_file(files):
@@ -160,11 +176,11 @@ with demo:
 
     with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
-            value=eval_dataframe_val,
+            value=eval_dataframe_val, datatype=TYPES, interactive=False,
        )
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
-            value=eval_dataframe_test,
+            value=eval_dataframe_test, datatype=TYPES, interactive=False,
        )
 
    refresh_button = gr.Button("Refresh")
@@ -181,10 +197,14 @@ with demo:
     with gr.Column():
        level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
        model_name_textbox = gr.Textbox(label="Model name")
-
+        model_family_textbox = gr.Textbox(label="Model family")
+        system_prompt_textbox = gr.Textbox(label="System prompt example")
+        url_textbox = gr.Textbox(label="Url to model information")
    with gr.Column():
        organisation = gr.Textbox(label="Organisation")
        mail = gr.Textbox(label="Contact email")
+        file_output = gr.File()
+
 
    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()
@@ -193,6 +213,9 @@
        [
            level_of_test,
            model_name_textbox,
+            model_family_textbox,
+            system_prompt_textbox,
+            url_textbox,
            file_output,
            organisation,
            mail
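As a quick sanity check of the new `get_dataframe_from_results` helper, the sketch below runs the same chain of `datasets` operations on a tiny in-memory split instead of the gated `results_public` dataset. The toy row, its scores and the URL are made up for illustration, and the import assumes you run it from the Space repository so that `content.py` (with the `model_hyperlink` helper added in this commit) is importable.

```python
# Minimal sketch: exercise the same transformations as get_dataframe_from_results
# on a fabricated one-row split (all values below are placeholders).
import pandas as pd
from datasets import Dataset

from content import model_hyperlink  # helper added to content.py in this commit

toy_split = Dataset.from_list([{
    "model": "my-agent",                    # placeholder model name
    "model_family": "GPT-4",                # placeholder family
    "url": "https://example.org/my-agent",  # placeholder model page
    "system_prompt": "You are a helpful assistant.",
    "mail": "me@example.org",
    "organisation": "Example Org",
    "score": 0.31, "score_level1": 0.52, "score_level2": 0.27, "score_level3": 0.08,
}])

# Same steps as in app.py: hyperlink the model name, drop private columns,
# then rename the remaining columns to their display names.
toy_split = toy_split.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
toy_split = toy_split.remove_columns(["mail", "system_prompt", "url"])
toy_split = toy_split.rename_column("model", "Model name")
toy_split = toy_split.rename_column("model_family", "Model family")
toy_split = toy_split.rename_column("score", "Average score (%)")
for i in [1, 2, 3]:
    toy_split = toy_split.rename_column(f"score_level{i}", f"Level {i} score (%)")

print(pd.DataFrame(toy_split))
```

The seven surviving columns line up with the new `TYPES` declaration (one markdown column for the hyperlinked model name, four numeric score columns, two plain-text columns), while `mail`, `system_prompt` and `url` are dropped before anything reaches the rendered table.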
content.py
CHANGED
@@ -2,18 +2,25 @@ TITLE = """<h1 align="center" id="space-title">GAIA Leaderboard</h1>"""
 
 CANARY_STRING = "" # TODO
 
-INTRODUCTION_TEXT =
+INTRODUCTION_TEXT = """
 Large language models have seen their potential capabilities increased by several orders of magnitude with the introduction of augmentations, from simple prompting adjustement to actual external tooling (calculators, vision models, ...) or online web retrieval.
-
 To evaluate the next generation of LLMs, we argue for a new kind of benchmark, simple and yet effective to measure actual progress on augmented capabilities,
 We therefore present GAIA.
 
 GAIA is made of 3 evaluation levels, depending on the added level of tooling and autonomy the model needs.
 We expect the level 1 to be breakable by very good LLMs, and the level 3 to indicate a strong jump in model capabilities.
+Each of these levels is divided into two sets: a fully public dev set, on which people can test their models, and a test set with private answers and metadata. Results can be submitted for both validation and test.
+
+We expect submissions to be json-line files with the following format:
+```
+{"task_id": "task_id_1", "model_answer": "Answer 1 from your model", "reasoning_trace": "The different steps by which your model reached answer 1"}
+{"task_id": "task_id_2", "model_answer": "Answer 2 from your model", "reasoning_trace": "The different steps by which your model reached answer 2"}
+...
+```
 
-
+Scores are expressed as the percentage of correct answers for a given split.
 
-Please do not repost the public dev set, nor use it in training data for your models.
+Please do not repost the public dev set, nor use it in training data for your models.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -21,8 +28,6 @@ CITATION_BUTTON_TEXT = r"""@misc{gaia, # TODO
   author = {tbd},
   title = {General AI Assistant benchamrk},
   year = {2023},
-  #publisher = {Hugging Face},
-  #howpublished = "\url{https://huggingface.co/spaces/gaia-benchmark/}"
 }"""
 
 
@@ -30,4 +35,8 @@ def format_warning(msg):
     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
 
 def format_log(msg):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
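For reference, a minimal sketch of producing a submission file in the json-lines format the updated `INTRODUCTION_TEXT` describes (the task ids, answers and the `submission.jsonl` filename are placeholders):

```python
# Sketch: serialise placeholder predictions to the json-lines submission format
# described in INTRODUCTION_TEXT (one JSON object per line with task_id,
# model_answer and reasoning_trace).
import json

predictions = [
    {"task_id": "task_id_1", "model_answer": "Answer 1 from your model",
     "reasoning_trace": "The different steps by which your model reached answer 1"},
    {"task_id": "task_id_2", "model_answer": "Answer 2 from your model",
     "reasoning_trace": "The different steps by which your model reached answer 2"},
]

with open("submission.jsonl", "w", encoding="utf-8") as f:
    for row in predictions:
        f.write(json.dumps(row) + "\n")
```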