"""Gradio leaderboard app: displays per-level results and accepts new submissions.

At import time this module loads the results and gold-answer datasets from the
Hugging Face Hub, builds the Gradio UI, starts a background job that restarts
the Space every hour, and launches the app.
"""
import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import (
    format_warning,
    format_log,
    TITLE,
    INTRODUCTION_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
)

BALM_TOKEN = os.environ.get("BALM_TOKEN", None)

OWNER = "gaia-benchmark"
DATA_DATASET = f"{OWNER}/GAIA"
SUBMISSION_DATASET = f"{OWNER}/submissions"
RESULTS_DATASET = f"{OWNER}/results"
LEADERBOARD_PATH = f"{OWNER}/leaderboard"
SPLIT = "validation"  # Change to test once we are ready to go
api = HfApi()

# Scored submission files are written here before being uploaded.
os.makedirs("scored", exist_ok=True)

# Display the results
eval_results = {}
for level in range(1, 4):
    eval_results[level] = load_dataset(
        RESULTS_DATASET, f"2023_level{level}", use_auth_token=BALM_TOKEN, split=SPLIT
    )

# The "mail" column is dropped so contact addresses are never displayed.
eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))

# Gold answers: level -> {task_id: ground_truth}
gold_results = {}
for level in range(1, 4):
    level_dataset = load_dataset(
        DATA_DATASET, f"2023_level{level}", split=SPLIT, use_auth_token=BALM_TOKEN
    )
    gold_results[level] = {row["task_id"]: row["ground_truth"] for row in level_dataset}


def restart_space():
    """Restart the leaderboard Space (scheduled below to run every hour)."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=BALM_TOKEN)


COLS = ["Model", "Score ⬆️", "Organisation"]
TYPES = ["str", "number", "str"]


def _is_duplicate_submission(results, model: str, organisation: str) -> bool:
    """Return True if this (model, organisation) pair is already in *results*.

    The pair is compared as a unit and case-insensitively, so a model name used
    by one organisation does not block another organisation from using it.
    """
    submitted = {
        (str(known_model).lower(), str(known_org).lower())
        for known_model, known_org in zip(results["model"], results["organisation"])
    }
    return (model.lower(), organisation.lower()) in submitted


def add_new_eval(
    level_of_dev: str,
    model: str,
    path_to_file,
    organisation: str,
    mail: str,
):
    """Score a submitted JSONL file and record it in the results dataset.

    Args:
        level_of_dev: Radio choice such as "Level 1"; the trailing integer
            selects the level.
        model: Name of the submitted model.
        path_to_file: Uploaded file object from ``gr.File``; its ``.name``
            attribute is used as the local path. May be ``None`` if nothing
            was uploaded.
        organisation: Name of the submitting organisation.
        mail: Contact email; stored with the entry but removed from the
            displayed tables.

    Returns:
        The result of ``format_warning`` when the submission is rejected, or of
        ``format_log`` when it succeeds.

    Raises:
        Exception: If a line of the submitted file has no "model_answer" key.
        KeyError: If a line has no "task_id", or its task id has no gold answer
            for the selected level.
    """
    level = int(level_of_dev.split(" ")[-1])

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")

    # Without this guard a missing upload fails with AttributeError on `.name`.
    if path_to_file is None:
        return format_warning("Please attach a file.")

    print("Adding new eval")
    # Check if the combination model/org already exists and prints a warning message if yes
    if _is_duplicate_submission(eval_results[level], model, organisation):
        return format_warning("This model has been already submitted.")

    file_path = path_to_file.name
    # NOTE(review): `organisation` and `model` are user-supplied and are used
    # verbatim in local and repo paths (a "/" would change the path) — consider
    # sanitising them.
    scored_path = f"scored/{organisation}_{model}.jsonl"

    # Save submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organisation}/{model}/level{level}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=BALM_TOKEN,
    )

    # Compute score
    total_score = 0
    with open(scored_path, "w", encoding="utf-8") as scored_file, open(
        file_path, "r", encoding="utf-8"
    ) as submission:
        for line in submission:
            # Tolerate blank lines (e.g. trailing newlines) in the JSONL file.
            if not line.strip():
                continue
            task = json.loads(line)

            if "model_answer" not in task:
                raise Exception("No model_answer key in the file provided")
            answer = task["model_answer"]
            task_id = task["task_id"]

            score = question_scorer(answer, gold_results[level][task_id])

            scored_file.write(
                json.dumps({
                    "id": task_id,
                    "model_answer": answer,
                    "score": score,
                }) + "\n"
            )
            total_score += score

    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organisation}/{model}/level{level}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=BALM_TOKEN,
    )

    # Actual submission
    eval_entry = {
        "model": model,
        "score": total_score,
        "organisation": organisation,
        "mail": mail,
    }
    eval_results[level] = eval_results[level].add_item(eval_entry)
    # TODO: change split to "test" once we have the actual results
    # NOTE(review): results are loaded at startup from RESULTS_DATASET but are
    # pushed here to (and re-read in `refresh` from) BALM_ResultsLevel{level} —
    # confirm which repo is the source of truth.
    eval_results[level].push_to_hub(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)

    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed")


def refresh():
    """Reload the three result datasets from the Hub.

    Returns:
        A tuple of three DataFrames (levels 1, 2, 3) with the "mail" column
        removed. The module-level ``eval_results`` is not modified.
    """
    fresh_results = {}
    for level in range(1, 4):
        fresh_results[level] = load_dataset(
            f"{OWNER}/BALM_ResultsLevel{level}", use_auth_token=BALM_TOKEN, split=SPLIT
        )
    eval_dataframe_1 = pd.DataFrame(fresh_results[1].remove_columns("mail"))
    eval_dataframe_2 = pd.DataFrame(fresh_results[2].remove_columns("mail"))
    eval_dataframe_3 = pd.DataFrame(fresh_results[3].remove_columns("mail"))
    return eval_dataframe_1, eval_dataframe_2, eval_dataframe_3


def upload_file(files):
    """Return the ``.name`` attribute of every object in *files*."""
    file_paths = [file.name for file in files]
    return file_paths


# NOTE(review): the widget nesting below was reconstructed from syntax because
# the original indentation was lost — confirm it matches the intended layout.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ).style(show_copy_button=True)

    with gr.Tab("Results: Level 1"):
        leaderboard_table_1 = gr.components.Dataframe(
            value=eval_dataframe_1, headers=COLS, datatype=TYPES, interactive=False,
        )
    with gr.Tab("Results: Level 2"):
        leaderboard_table_2 = gr.components.Dataframe(
            value=eval_dataframe_2, headers=COLS, datatype=TYPES, interactive=False,
        )
    with gr.Tab("Results: Level 3"):
        leaderboard_table_3 = gr.components.Dataframe(
            value=eval_dataframe_3, headers=COLS, datatype=TYPES, interactive=False,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_1,
            leaderboard_table_2,
            leaderboard_table_3,
        ],
    )
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                # f-string so the label shows the actual split name rather than
                # a literal "{split}" placeholder.
                level_of_test = gr.Radio(
                    ["Level 1", "Level 2", "Level 3"],
                    value="Level 1",
                    label=f"{SPLIT} set level",
                )
                model_name_textbox = gr.Textbox(label="Model name")
                file_output = gr.File()
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email")

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                file_output,
                organisation,
                mail,
            ],
            submission_result,
        )

# Restart the Space every hour.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch()