# polish_eq-bench / app.py
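"""Gradio app for the Polish EQ-Bench leaderboard.

Reads benchmark results from benchmark_results.csv, cleans and reshapes them
with pandas, and renders a styled, colour-coded leaderboard table.
"""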
import json
import re

import gradio as gr
import numpy
import pandas as pd

from src.about import (
    INTRODUCTION_TEXT,
    TITLE,
    AUTHORS,
)
from src.display.css_html_js import custom_css
from src.display.formatting import make_clickable_model

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    NUMBER_OF_QUESTIONS = 171.0

    # Load the results by hand rather than with pd.read_csv: the trailing
    # Error column may itself contain commas, so each row is split at most
    # 13 times to keep the whole error message in the last field.
    leaderboard_df = []
    with open("benchmark_results.csv", "r") as f:
        header = [h.strip() for h in f.readline().strip().split(",")]
        for line in f:
            leaderboard_df.append(line.strip().split(",", 13))

    # Model metadata (parameter counts); some keys carry extra comma-separated
    # fields, so each entry is also indexed under its first field.
    with open("metadata.json") as f:
        metadata = json.load(f)
    for k, v in list(metadata.items()):
        metadata[k.split(",")[0]] = v

    # Build the dataframe and keep only the Polish EQ-Bench runs. The
    # elementwise `|` (not `or`) avoids "ValueError: The truth value of a
    # Series is ambiguous".
    leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)
    leaderboard_df = leaderboard_df[(leaderboard_df["Benchmark Version"] == "eq-bench_v2_pl")
                                    | (leaderboard_df["Benchmark Version"] == "eq-bench_pl")]

    # Keep only the columns shown on the leaderboard.
    leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]

    def parse_parseable(x):
        """Recover the parseable-question count from the Error message when
        the run is marked FAILED; NaN if the message does not match."""
        if x["Num Questions Parseable"] == "FAILED":
            m = re.match(r"(\d+)\.0 questions were parseable", x["Error"])
            return m.group(1) if m else numpy.nan
        return x["Num Questions Parseable"]

    leaderboard_df["Num Questions Parseable"] = leaderboard_df[
        ["Num Questions Parseable", "Error"]].apply(parse_parseable, axis=1)

    def fraction_to_percentage(numerator: float, denominator: float) -> float:
        return (numerator / denominator) * 100

    # Express the parseable-question count as a percentage of the full set.
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(
        lambda x: fraction_to_percentage(float(x), NUMBER_OF_QUESTIONS))

    def get_params(model_name):
        """Parameter count from metadata; NaN when the model is unknown."""
        if model_name in metadata:
            return metadata[model_name]
        return numpy.nan

    leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

    # Reorder columns and render each model path as a clickable link.
    leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", "Error"]]
    leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(make_clickable_model)

    # Replace FAILED markers with NaN so the numeric columns can be cast to float.
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace("FAILED", numpy.nan)
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)

    # Clamp negative benchmark scores to 0.
    leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

    # Sort by score, then by parseable percentage, best first.
    leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
                                                ascending=[False, False])
    leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model",
                                                    "Num Questions Parseable": "Percentage Questions Parseable"})

    # Colour-code the table; the Params gradient is reversed so fewer
    # parameters shade towards green.
    leaderboard_df_styled = leaderboard_df.style.background_gradient(cmap="RdYlGn")
    leaderboard_df_styled = leaderboard_df_styled.background_gradient(cmap="RdYlGn_r", subset=["Params"])
    rounding = {
        "Benchmark Score": "{:.2f}",
        "Percentage Questions Parseable": "{:.2f}",
        "Params": "{:.0f}",
    }
    leaderboard_df_styled = leaderboard_df_styled.format(rounding)

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df_styled,
        datatype=["markdown", "number", "number", "number", "str"],
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

    gr.Markdown(AUTHORS, elem_classes="markdown-text")
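
# Queue incoming requests before launching; default_concurrency_limit caps
# how many events each handler processes concurrently.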
demo.queue(default_concurrency_limit=40).launch()