|
import html
import json
import os
import shutil

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import Repository
|
|
|
# Token used to clone the evaluation-results dataset; None when the
# environment variable is not set.
H4_TOKEN = os.environ.get("H4_TOKEN")

# Handle on the local clone of the results dataset. It stays None when no
# token is available; code that pulls updates checks it first.
repo = None
if H4_TOKEN:
    repo = Repository(
        local_dir="./evals/",
        clone_from="HuggingFaceH4/lmeh_evaluations",
        use_auth_token=H4_TOKEN,
        repo_type="dataset",
    )
    repo.git_pull()
|
|
|
|
|
|
|
# One row per benchmark: (harness task key, leaderboard column label, metric
# read from that task's results). Keeping the three facts side by side stops
# the parallel constants below from drifting out of step.
_BENCHMARK_TABLE = (
    ("arc_challenge", "ARC", "acc_norm"),
    ("hellaswag", "HellaSwag", "acc_norm"),
    ("hendrycks", "MMLU", "acc_norm"),
    ("truthfulqa_mc", "TruthQA", "mc2"),
)

# Task keys, in leaderboard order.
BENCHMARKS = [task for task, _label, _metric in _BENCHMARK_TABLE]
# Task key -> column label shown in the leaderboard.
BENCH_TO_NAME = {task: label for task, label, _metric in _BENCHMARK_TABLE}
# Metric to read for each entry of BENCHMARKS (same order).
METRICS = [metric for _task, _label, metric in _BENCHMARK_TABLE]
|
|
|
# Names directly under ./evals, skipping dot-prefixed entries.
entries = list(filter(lambda name: not name.startswith("."), os.listdir("evals")))
# Only sub-directories are kept; each one is treated as a model's result folder.
model_directories = [name for name in entries if os.path.isdir(os.path.join("evals", name))]
|
|
|
|
|
def make_clickable_model(model_name):
    """Return an HTML anchor that links ``model_name`` to huggingface.co.

    The name is HTML-escaped before it is placed in the ``href`` attribute and
    in the link text, so a value containing ``"``, ``<``, ``>`` or ``&`` cannot
    break out of the markup. Names made only of characters that need no
    escaping produce exactly the same string as before.

    Args:
        model_name: Identifier appended to ``https://huggingface.co/``.

    Returns:
        The ``<a ...>...</a>`` markup as a string.
    """
    safe_name = html.escape(model_name, quote=True)
    link = "https://huggingface.co/" + safe_name
    return f'<a target="_blank" href="{link}" style="color: blue; text-decoration: underline;text-decoration-style: dotted;">{safe_name}</a>'
|
|
|
def load_results(model, benchmark, metric):
    """Load one benchmark result file and average ``metric`` over its tasks.

    The file read is ``evals/<model>/<model>-eval_<benchmark>.json``. Every
    value under its top-level ``"results"`` object is expected to be a mapping
    that contains ``metric``.

    Args:
        model: Name of the model's directory under ``evals``.
        benchmark: Benchmark key used in the result file name.
        metric: Key to read from each entry of ``"results"``.

    Returns:
        A ``(mean_score, model_args)`` tuple, where ``model_args`` is
        ``data["config"]["model_args"]``. Returns ``(0.0, None)`` when the
        result file does not exist, and a score of ``0.0`` when the file has
        no result entries.

    Raises:
        KeyError: If ``"results"``, ``"config"``/``"model_args"`` or ``metric``
            is missing from the file.
    """
    file_path = os.path.join("evals", model, f"{model}-eval_{benchmark}.json")
    try:
        with open(file_path, encoding="utf-8") as fp:
            data = json.load(fp)
    except FileNotFoundError:
        # A model that was not run on this benchmark simply scores zero.
        return 0.0, None

    scores = [task_scores[metric] for task_scores in data["results"].values()]
    # np.mean of an empty sequence is NaN (plus a RuntimeWarning); a NaN here
    # would propagate into the row total, so treat "no entries" as zero.
    mean_acc = np.mean(scores) if scores else 0.0
    return mean_acc, data["config"]["model_args"]
|
|
|
# Leaderboard columns, in display order.
COLS = ["eval_name", "total", "ARC", "HellaSwag", "MMLU", "TruthQA", "base_model"]
# Display datatype for each entry of COLS (same order).
TYPES = ["str", "number", "number", "number", "number", "number", "markdown"]


def get_leaderboard():
    """Build the leaderboard table from the result files under ``evals``.

    Pulls the results repository first when one is configured, then creates
    one row per model directory holding the per-benchmark scores, their sum
    (``total``) and a link for the base model when one is recorded.

    Returns:
        A ``pandas.DataFrame`` with the columns listed in ``COLS``, sorted by
        ``total`` in descending order. The frame is empty (but still has those
        columns) when there are no model directories.
    """
    if repo:
        repo.git_pull()

    # Scan the directory on every call: a pull can add model folders that did
    # not exist when the module was imported.
    model_names = [
        entry
        for entry in os.listdir("evals")
        if not entry.startswith(".") and os.path.isdir(os.path.join("evals", entry))
    ]

    all_data = []
    for model in model_names:
        # "base_model" must be present from the start: a model with no result
        # files never gets it assigned in the loop below.
        model_data = {"eval_name": model, "base_model": None}

        for benchmark, metric in zip(BENCHMARKS, METRICS):
            value, base_model = load_results(model, benchmark, metric)
            model_data[BENCH_TO_NAME[benchmark]] = value
            if base_model is not None:
                model_data["base_model"] = base_model

        model_data["total"] = sum(model_data[name] for name in BENCH_TO_NAME.values())

        # NOTE(review): base_model is the result file's config "model_args"
        # value used verbatim as the link target -- confirm it is a bare
        # model id and not a "key=value" argument string.
        if model_data["base_model"] is not None:
            model_data["base_model"] = make_clickable_model(model_data["base_model"])
        all_data.append(model_data)

    # Passing columns= fixes the column order and keeps the columns present
    # even when there are no rows, so the sort below cannot raise KeyError.
    dataframe = pd.DataFrame(all_data, columns=COLS)
    return dataframe.sort_values(by="total", ascending=False)
|
|
|
# Initial table contents, computed once at start-up.
leaderboard = get_leaderboard()

block = gr.Blocks()
with block:
    gr.Markdown("""
    # H4 Model Evaluation leaderboard using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> LMEH benchmark suite </a>.
    Evaluation is performed against 4 popular benchmarks AI2 Reasoning Challenge, HellaSwag, MMLU, and TruthFul QC MC. To run your own benchmarks, refer to the README in the H4 repo.
    """)

    with gr.Row():
        leaderboard_table = gr.components.Dataframe(
            value=leaderboard, headers=COLS, datatype=TYPES, max_rows=5
        )
    with gr.Row():
        refresh_button = gr.Button("Refresh")
        # Clicking recomputes the leaderboard and replaces the table contents.
        refresh_button.click(get_leaderboard, inputs=[], outputs=leaderboard_table)


def refresh_leaderboard():
    """Periodic job: recompute the leaderboard and log that it ran.

    The returned frame is discarded. The call is made for the side effects of
    ``get_leaderboard`` (it pulls the results repository when one is
    configured); the table shown in the UI is only replaced through the
    Refresh button.
    """
    get_leaderboard()
    print("leaderboard updated")


# Start the background job BEFORE launching the app: when this file runs as a
# script, launch() blocks the main thread, so anything placed after it would
# not execute while the server is up.
scheduler = BackgroundScheduler()
scheduler.add_job(func=refresh_leaderboard, trigger="interval", seconds=300)
scheduler.start()

block.launch()