|
import datetime
import glob
import json
import os
import shutil
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, Repository
from transformers import AutoConfig
|
|
|
# Value of the H4_TOKEN environment variable, or None when it is unset
# (presumably a Hugging Face Hub access token -- confirm against its users).
H4_TOKEN = os.getenv("H4_TOKEN")

# Repository id of the evaluation results (looks like a Hub repo id; it is
# not referenced in this part of the file).
LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"

# BENCHMARKS and METRICS are zipped together when result files are parsed:
# METRICS[i] is the key read from each result entry of BENCHMARKS[i], so the
# two lists must stay aligned.
BENCHMARKS = [
    "arc_challenge",
    "hellaswag",
    "hendrycks",
    "truthfulqa_mc",
]
METRICS = [
    "acc_norm",
    "acc_norm",
    "acc_norm",
    "mc2",
]

# Leaderboard column header used for each benchmark's score.
BENCH_TO_NAME = {
    "arc_challenge": "ARC (25-shot) ⬆️",
    "hellaswag": "HellaSwag (10-shot) ⬆️",
    "hendrycks": "MMLU (5-shot) ⬆️",
    "truthfulqa_mc": "TruthQA (0-shot) ⬆️",
}
|
def make_clickable_model(model_name):
    """Return an HTML anchor linking ``model_name`` to huggingface.co.

    The anchor opens in a new tab, points at
    ``https://huggingface.co/<model_name>`` and shows ``model_name`` as its
    text. The name is inserted verbatim (no HTML or URL escaping).
    """
    anchor_template = '<a target="_blank" href="{link}" style="color: blue; text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    return anchor_template.format(
        link="https://huggingface.co/" + model_name,
        model_name=model_name,
    )
|
|
|
@dataclass
class EvalResult:
    """Benchmark scores gathered for one model at one precision.

    Attributes:
        org: Organisation part of the model id, or None for models that
            have no organisation directory in their results path.
        model: Model name.
        is_8bit: True when the results come from the ``8bit`` directory.
        results: Mapping of benchmark name to its mean score. Benchmarks
            that have not been evaluated are simply absent.
    """

    org: Optional[str]
    model: str
    is_8bit: bool
    results: dict

    def to_dict(self):
        """Return this result as one leaderboard row.

        The row holds the clickable model link (``"base_model"``), the sum of
        all available scores (``"total ⬆️"``), a parameter-count placeholder
        (``"# params"``) and one column per entry of ``BENCH_TO_NAME``; a
        benchmark without a score gets None in its column.

        ``self.results`` is left untouched, so the method can be called
        repeatedly.
        """
        # Models without an organisation must link to "<model>", not to
        # "None/<model>".
        if self.org is None:
            repo_id = self.model
        else:
            repo_id = f"{self.org}/{self.model}"

        # Skip None placeholders so the total never fails on missing scores.
        available_scores = [
            score for score in self.results.values() if score is not None
        ]

        data_dict = {
            "base_model": make_clickable_model(repo_id),
            "total ⬆️": sum(available_scores),
            "# params": "unknown (todo)",
        }
        for benchmark, column_name in BENCH_TO_NAME.items():
            data_dict[column_name] = self.results.get(benchmark)

        return data_dict
|
|
|
|
|
|
|
|
|
def parse_eval_result(json_filepath: str) -> Tuple[str, Optional[EvalResult]]:
    """Parse one evaluation JSON file into a grouping key and an ``EvalResult``.

    The model identity is taken from the path: the directory holding the file
    names the precision (``"8bit"`` marks an 8-bit run), its parent is the
    model name and, unless the path has exactly five components, the
    directory above that is the organisation.

    Args:
        json_filepath: Path of the results file. Its JSON content must have a
            ``"results"`` mapping whose values contain the benchmark's metric.

    Returns:
        ``(result_key, eval_result)``. ``result_key`` joins organisation (if
        any), model and precision directory with underscores, so all files of
        one model/precision share a key. ``eval_result`` holds the mean of
        the benchmark's metric over all ``"results"`` entries, or is None
        when the path names none of ``BENCHMARKS``.

    Raises:
        IndexError: If the path has fewer than four components.
        KeyError: If a benchmark matches but the file has no ``"results"``
            mapping or an entry lacks the benchmark's metric.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_filepath, encoding="utf-8") as fp:
        data = json.load(fp)

    # glob yields OS-native separators; normalise so the layout logic below
    # also holds on platforms whose separator is not "/".
    path_split = json_filepath.replace(os.sep, "/").split("/")
    model = path_split[-3]
    precision_dir = path_split[-2]
    is_8bit = precision_dir == "8bit"

    if len(path_split) == 5:
        # Five components: no organisation directory above the model.
        org = None
        result_key = f"{model}_{precision_dir}"
    else:
        org = path_split[-4]
        result_key = f"{org}_{model}_{precision_dir}"

    eval_result = None
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        # NOTE(review): this is a substring match over the whole path, so an
        # org/model name containing a benchmark name also matches, and the
        # last matching benchmark wins -- confirm that is acceptable.
        if benchmark in json_filepath:
            mean_acc = np.mean(
                [entry[metric] for entry in data["results"].values()]
            )
            eval_result = EvalResult(org, model, is_8bit, {benchmark: mean_acc})

    return result_key, eval_result
|
|
|
|
|
|
|
|
|
def get_eval_results() -> List[EvalResult]:
    """Collect every result file under ``evals/eval_results`` into ``EvalResult``s.

    All ``*.json`` files below that directory are parsed with
    ``parse_eval_result``; results sharing a key are merged into a single
    ``EvalResult`` whose ``results`` dict carries every benchmark found.
    Files that match no known benchmark are ignored.

    Returns:
        One ``EvalResult`` per distinct result key, in order of first
        appearance among the sorted file paths.
    """
    # glob returns paths in arbitrary order; sort for a reproducible result.
    json_filepaths = sorted(
        glob.glob("evals/eval_results/**/*.json", recursive=True)
    )

    merged = {}
    for json_filepath in json_filepaths:
        result_key, eval_result = parse_eval_result(json_filepath)
        if eval_result is None:
            # Not a result for any tracked benchmark; keeping it would put a
            # None in the output or fail on ``.results`` below.
            continue

        existing = merged.get(result_key)
        if existing is None:
            merged[result_key] = eval_result
        else:
            existing.results.update(eval_result.results)

    return list(merged.values())
|
|
|
def get_eval_results_dicts() -> List[Dict]:
    """Return the ``to_dict()`` row of every result from ``get_eval_results()``."""
    rows = []
    for result in get_eval_results():
        rows.append(result.to_dict())
    return rows
|
|
|
# Build the leaderboard rows once, at import time, and echo them to stdout.
eval_results_dict = get_eval_results_dicts()
print(eval_results_dict)