reward-bench / app.py
import gradio as gr
import os
from huggingface_hub import HfApi, snapshot_download
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from src.utils import load_all_data
from src.md import ABOUT_TEXT, TOP_TEXT
from src.plt import plot_avg_correlation
from src.constants import subset_mapping, length_categories, example_counts
import numpy as np

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")

evals_repo = "ai2-adapt-dev/HERM-Results"
eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
repo_dir_herm = "./evals/herm/"
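# COLLAB_TOKEN is assumed to be a HF access token with read permission for the repos above
# (they may be private); evals_repo holds the per-model result files pulled below, and
# eval_set_repo holds the benchmark prompts used in the Dataset Viewer tab.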
def restart_space():
    api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)


print("Pulling evaluation results")
repo = snapshot_download(
    local_dir=repo_dir_herm,
    ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)
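# The snapshot lands in repo_dir_herm. Based on how load_all_data is called further down,
# the layout is assumed to be roughly:
#   ./evals/herm/eval-set/<model>.json    core benchmark scores per model
#   ./evals/herm/pref-sets/<model>.json   scores on existing preference test sets
# Per-prompt score folders are skipped via ignore_patterns above.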
def avg_over_herm(dataframe_core, dataframe_prefs):
    """
    Averages over the alpacaeval, mt-bench, llmbar, refusals, and hep subsets and returns a dataframe with only the aggregated columns.
    We average over 4 core sections (weighted per prompt):
    1. Chat: includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
    2. Chat Hard: includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
    3. Safety: includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
    4. Code: includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
    """
    new_df = dataframe_core.copy()
    dataframe_prefs = dataframe_prefs.copy()

    # for the main subsets (keys of subset_mapping), take the average weighted by example_counts and store it per model
    for subset, sub_subsets in subset_mapping.items():
        subset_cols = [col for col in new_df.columns if col in sub_subsets]
        sub_data = new_df[subset_cols].values  # take the relevant column values
        sub_counts = [example_counts[s] for s in subset_cols]  # example counts, aligned with the selected column order
        new_df[subset] = np.round(np.average(sub_data, axis=1, weights=sub_counts), 2)  # take the weighted average
        # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)

    data_cols = list(subset_mapping.keys())
    keep_columns = ["model", "model_type"] + data_cols
    # keep_columns = ["model", "average"] + subsets
    new_df = new_df[keep_columns]

    # selected average from pref_sets
    pref_columns = ["anthropic_helpful", "mtbench_gpt4", "shp", "summarize"]
    pref_data = dataframe_prefs[pref_columns].values

    # the pref-set rows do not match new_df exactly, so compute "Test Sets" there and merge onto the superset of models
    dataframe_prefs["Test Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)

    # add an empty "Test Sets" column to new_df, then fill it per row whenever the model exists in dataframe_prefs
    new_df["Test Sets"] = np.nan
    values = []
    for i, row in new_df.iterrows():
        model = row["model"]
        if model in dataframe_prefs["model"].values:
            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Test Sets"].values[0])
            # new_df.at[i, "Test Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Test Sets"].values[0]
        else:
            values.append(np.nan)
    new_df["Test Sets"] = values

    # add the total average
    data_cols += ["Test Sets"]
    new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)

    # make average the third column
    keep_columns = ["model", "model_type", "average"] + data_cols
    new_df = new_df[keep_columns]
    return new_df
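# Illustrative sketch of the section aggregation above (hypothetical numbers): if a model
# scored 0.90 on alpacaeval-easy (100 prompts) and 0.80 on mt-bench-easy (28 prompts), and
# "Chat" contained only those two subsets, its Chat entry would be
# np.average([0.90, 0.80], weights=[100, 28]) ~= 0.88, i.e. each subset counts in proportion
# to its example count rather than equally.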
def expand_subsets(dataframe):
    # TODO: need to modify the data/ script to do this
    pass


def length_bias_check(dataframe):
    """
    Takes the raw herm dataframe and splits the data into new buckets according to length_categories,
    then averages each bucket per model.
    """
    new_df = dataframe.copy()
    existing_subsets = new_df.columns[3:]  # skip model, model_type, average
    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
    # new_data is a dict of empty lists, one per final subset
    new_data = {s: [] for s in final_subsets}

    # the buckets correspond to subsets with "True", "Neutral", and "False" length bias in length_categories
    for subset in existing_subsets:
        subset_data = new_df[subset].values
        subset_length = length_categories[subset]
        # route to the correct bucket
        if subset_length == "True":
            new_data["Length Bias"].append(subset_data)
        elif subset_length == "Neutral":
            new_data["Neutral"].append(subset_data)
        elif subset_length == "False":
            new_data["Terse Bias"].append(subset_data)

    # average each bucket and add it to new_df (keeping only the model column)
    for subset in final_subsets:
        new_df[subset] = np.round(np.nanmean(new_data[subset], axis=0), 2)
    keep_columns = ["model"] + final_subsets
    new_df = new_df[keep_columns]
    # recompute average
    # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
    return new_df
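# Note on the averaging above: new_data[bucket] is a list of per-model score arrays (one per
# routed subset), so np.nanmean(..., axis=0) gives each model's mean score within that bucket.
# "Length Bias" / "Terse Bias" are assumed to collect subsets where longer / shorter responses
# tend to be preferred, per length_categories.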
herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
herm_data_avg = avg_over_herm(herm_data, prefs_data).sort_values(by='average', ascending=False)
col_types_herm = ["markdown"] + ["str"] + ["number"] * (len(herm_data.columns) - 1)
col_types_herm_avg = ["markdown"] + ["str"] + ["number"] * (len(herm_data_avg.columns) - 1)
cols_herm_data_length = ["markdown"] + ["number"] * (len(herm_data_length.columns) - 1)
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
# for showing random samples
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
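# The "filtered" split is assumed to be the curated benchmark set; it backs the random-sample
# viewer and the subset dropdown defined below.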
def random_sample(r: gr.Request, subset):
    if subset is None or subset == []:
        sample_index = np.random.randint(0, len(eval_set))  # randint's upper bound is exclusive
        sample = eval_set[sample_index]
    else:  # filter by subsets (can be a list)
        if isinstance(subset, str):
            subset = [subset]
        # filter the dataset down to only the selected subset(s)
        eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
        sample_index = np.random.randint(0, len(eval_set_filtered))
        sample = eval_set_filtered[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
    return markdown_text
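# Sketch of the returned markdown (field names such as subset, prompt, chosen, rejected are
# assumed from the dataset, not guaranteed): each key/value pair renders as "**key**:" followed
# by its value, joined with blank lines into one string for gr.Markdown.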
subsets = eval_set.unique("subset")
def regex_table(dataframe, regex, filter_button):
    """
    Takes a comma-separated list of model-name regexes and returns only the rows whose model matches one of them.
    """
    # split the regex statement by commas and trim whitespace around each pattern
    regex_list = [x.strip() for x in regex.split(",")]
    # join the list into a single regex pattern, with '|' acting as OR
    combined_regex = '|'.join(regex_list)

    # unless the checkbox is ticked (or "ai2" is searched explicitly), hide rows with "ai2" in the model name
    if (not filter_button) and ("ai2" not in regex):
        dataframe = dataframe[~dataframe["model"].str.contains("ai2", case=False, na=False)]

    # keep rows whose 'model' contains any of the regex patterns
    return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
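# Illustrative usage ("llama" and "zephyr" are hypothetical search terms): a search box value
# of "llama, zephyr" with the AI2 checkbox unticked behaves like
#   regex_table(herm_data_avg, "llama, zephyr", False)
# i.e. it keeps rows whose model name case-insensitively matches "llama|zephyr" and hides
# ai2 training runs.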
with gr.Blocks() as app:
    # create tabs for the app, moving the current table to one titled "HERM" and the benchmark text to a tab called "About"
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(TOP_TEXT)
        with gr.Column(scale=2):
            search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
            filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM Eval Set - Overview"):
            with gr.Row():
                # reference data
                herm_table_hidden = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    visible=False,
                )
                herm_table = gr.Dataframe(
                    regex_table(herm_data_avg.copy(), "", False).values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                    height=1000,
                )
        with gr.TabItem("HERM Eval Set - Detailed"):
            with gr.Row():
                # reference data
                herm_table_detailed_hidden = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    visible=False,
                )
                herm_table_detailed = gr.Dataframe(
                    regex_table(herm_data.copy(), "", False).values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                    height=1000,
                )
        with gr.TabItem("HERM Eval Set - Length Bias"):
            with gr.Row():
                # backup
                herm_table_len_hidden = gr.Dataframe(
                    herm_data_length.values,
                    datatype=cols_herm_data_length,
                    headers=herm_data_length.columns.tolist(),
                    visible=False,
                )
                herm_table_len = gr.Dataframe(
                    regex_table(herm_data_length.copy(), "", False).values,
                    datatype=cols_herm_data_length,
                    headers=herm_data_length.columns.tolist(),
                    elem_id="herm_dataframe_length",
                    height=1000,
                )
        with gr.TabItem("Known Pref. Sets"):
            with gr.Row():
                PREF_SET_TEXT = """
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
"""
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                # backup
                pref_sets_table_hidden = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    visible=False,
                )
                pref_sets_table = gr.Dataframe(
                    regex_table(prefs_data.copy(), "", False).values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                    height=1000,
                )
        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)
        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("## Random Dataset Sample Viewer")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")
            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")
            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])

    # removed plot because not pretty enough
    # with gr.TabItem("Model Correlation"):
    #     with gr.Row():
    #         plot = plot_avg_correlation(herm_data_avg, prefs_data)
    #         gr.Plot(plot)

    search.change(regex_table, inputs=[herm_table_hidden, search, filter_button], outputs=herm_table)
    search.change(regex_table, inputs=[herm_table_detailed_hidden, search, filter_button], outputs=herm_table_detailed)
    search.change(regex_table, inputs=[herm_table_len_hidden, search, filter_button], outputs=herm_table_len)
    search.change(regex_table, inputs=[pref_sets_table_hidden, search, filter_button], outputs=pref_sets_table)

# Load data when app starts, TODO make this used somewhere...
# def load_data_on_start():
#     data_herm = load_all_data(repo_dir_herm)
#     herm_table.update(data_herm)
#     data_herm_avg = avg_over_herm(repo_dir_herm)
#     herm_table.update(data_herm_avg)
#     data_prefs = load_all_data(repo_dir_prefs)
#     pref_sets_table.update(data_prefs)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=10800)  # restart every 3h
scheduler.start()
app.launch()  # had .queue() before launch before... not sure if that's necessary