"""Gradio leaderboard for AfriMMLU / AfriXNLI evaluation results.

Reads every result file matching ``evals/*/*.json``, builds one table row
per (model, language) pair and serves the table with a search box.
"""

import glob
import json
import logging
import os
from collections import defaultdict

import gradio as gr
import pandas as pd

from content import *
from css import *

# Re-imported after the wildcard imports so that ``glob`` is guaranteed to be
# the standard-library module even if one of them exports the same name.
import glob

logger = logging.getLogger(__name__)

AFRIMMLU_DIRECT = "afrimmlu_direct"
AFRIMMLU_TRANSLATE = "afrimmlu_translate"
AFRIXNLI_DIRECT = "afrixnli_direct"
AFRIXNLI_TRANSLATE = "afrixnli_translate"
BENCHMARKS = [AFRIMMLU_DIRECT, AFRIMMLU_TRANSLATE, AFRIXNLI_DIRECT, AFRIXNLI_TRANSLATE]

METRICS = ["acc", "acc_stderr", "f1"]

# Metric read for each benchmark. The expression is the lookup the app has
# always used (benchmark position minus one, so position 0 wraps around to the
# last metric). It resolves to:
#   afrimmlu_direct    -> "f1"
#   afrimmlu_translate -> "acc"
#   afrixnli_direct    -> "acc_stderr"
#   afrixnli_translate -> "f1"
# NOTE(review): displaying "acc_stderr" as the afrixnli_direct score looks
# unintended -- confirm the intended metric per benchmark before changing it.
TASK_METRIC = {task: METRICS[i - 1] for i, task in enumerate(BENCHMARKS)}

LANGS = ['amh', 'eng', 'ewe', 'fra', 'hau', 'ibo', 'kin', 'lin', 'lug', 'orm',
         'sna', 'sot', 'swa', 'twi', 'wol', 'xho', 'yor', 'zul']

LANG_NAME = {
    'amh': 'Amharic',
    'eng': 'English',
    'ewe': 'Ewe',
    'fra': 'French',
    'hau': 'Hausa',
    'ibo': 'Igbo',
    'kin': 'Kinyarwanda',
    'lin': 'Lingala',
    'lug': 'Luganda',
    'orm': 'Oromo',
    'sna': 'Shona',
    'sot': 'Sotho',
    'swa': 'Swahili',
    'twi': 'Twi',
    'wol': 'Wolof',
    'xho': 'Xhosa',
    'yor': 'Yoruba',
    'zul': 'Zulu',
}

MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
AVERAGE_COL = "Average"
AFRIMMLU_DIRECT_COL = "AfriMMLU Direct (0-Shot)"
AFRIMMLU_TRANSLATE_COL = "AfriMMLU Translate (0-Shot)"
AFRIXNLI_DIRECT_COL = "AfriXNLI Direct (0-Shot)"
AFRIXNLI_TRANSLATE_COL = "AfriXNLI Translate (0-Shot)"
NOTES_COL = "Notes"  # For search only

# The four benchmark columns are in the same order as BENCHMARKS.
COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, AFRIMMLU_DIRECT_COL,
        AFRIMMLU_TRANSLATE_COL, AFRIXNLI_DIRECT_COL, AFRIXNLI_TRANSLATE_COL, NOTES_COL]
TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"]


def collect_results():
    """Load all evaluation files and index their scores.

    A file is ignored when it has no ``results`` or ``config`` section, when
    its config has no ``model_args``, or when ``model_args`` does not contain
    exactly one ``pretrained=...`` entry. Result keys are expected to look
    like ``<benchmark>_<lang>``; entries for an unknown benchmark or without
    the expected metric are logged and skipped instead of aborting start-up.

    Returns:
        A ``(performance_dict, pretrained_models)`` tuple.
        ``performance_dict`` maps ``(model_name, lang)`` to
        ``{benchmark: score}`` where the score is the metric value times 100,
        rounded to one decimal. ``pretrained_models`` is the set of model
        names seen (the last path component of the ``pretrained`` value).
    """
    performance_dict = defaultdict(dict)
    pretrained_models = set()

    # Sorted so that, if two files report the same (model, language,
    # benchmark), the one that wins does not depend on directory order.
    for path in sorted(glob.glob('evals/*/*.json')):
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if 'results' not in data or 'config' not in data:
            continue
        config = data['config']
        if 'model_args' not in config:
            continue

        pretrained = [
            arg for arg in config['model_args'].split(',')
            if arg.startswith('pretrained=')
        ]
        if len(pretrained) != 1:
            continue
        # Split only on the first '=' so a value containing '=' stays intact,
        # then keep the last path component as the display name.
        model_name = pretrained[0].split('=', 1)[1].split('/')[-1]
        pretrained_models.add(model_name)

        for lang_task, perfs in data['results'].items():
            # Everything before the last underscore is the benchmark name,
            # the remainder is the language code.
            task, _, lang = lang_task.rpartition('_')
            if task not in TASK_METRIC:
                logger.warning("Skipping unknown task %r in %s", lang_task, path)
                continue
            if not lang:
                continue

            metric = TASK_METRIC[task]
            if metric not in perfs:
                logger.warning(
                    "Skipping %r in %s: metric %r is missing", lang_task, path, metric
                )
                continue
            performance_dict[(model_name, lang)][task] = round(perfs[metric] * 100, 1)

    return performance_dict, pretrained_models


def get_leaderboard_df(performance_dict, pretrained_models):
    """Build the leaderboard table from the collected scores.

    Args:
        performance_dict: Mapping of ``(model_name, lang)`` to
            ``{benchmark: score}`` as returned by ``collect_results``.
        pretrained_models: Accepted so the function can be called with
            ``*collect_results()``; not used here.

    Returns:
        A DataFrame with the columns in ``COLS``, sorted by language and then
        by average score, both descending. A benchmark without a score is
        shown as 0.0 and is left out of the average.
    """
    rows = []
    for (pretrained, lang), perfs in performance_dict.items():
        # Fall back to the raw code for a language missing from LANG_NAME.
        lang_name = LANG_NAME.get(lang, lang)

        scores = [perfs.get(task, 0.0) for task in BENCHMARKS]
        # A score of 0.0 is treated as "not available" and excluded from the
        # average; with no available score the average itself is 0.0.
        available = [score for score in scores if score]
        avg = round(sum(available) / len(available), 1) if available else 0.0

        notes = ' '.join([pretrained, lang_name])
        rows.append([pretrained, lang_name, lang, avg, *scores, notes])

    df = pd.DataFrame.from_records(rows, columns=COLS)
    return df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)


def search_table(df, query):
    """Return the rows of ``df`` whose notes column contains ``query``.

    The match is a case-insensitive plain substring test: the query is not
    interpreted as a regular expression, so characters such as ``(`` or ``+``
    can be typed safely. An empty or missing query returns ``df`` unfiltered.
    """
    if not query:
        return df
    mask = df[NOTES_COL].str.contains(query, case=False, regex=False, na=False)
    return df[mask]


args = collect_results()
original_df = get_leaderboard_df(*args)

demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Group():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...",
            show_label=False,
            elem_id="search-bar",
        )

        leaderboard_table = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            elem_id="leaderboard-table",
        )

        # Hidden, never-filtered copy of the table. Every search filters this
        # copy rather than the visible table, so shortening the query (e.g.
        # with backspace) brings previously hidden rows back.
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            visible=False,
        )

        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()