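"""PPE Metrics Explorer.

A Streamlit app for browsing benchmark metric results loaded from results.json:
pick a benchmark, filter metric columns and model rows, and download the
resulting table as CSV.
"""
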
import streamlit as st
import pandas as pd
import json
from os.path import split as path_split, splitext as path_splitext

st.set_page_config(
    page_title="PPE Metrics Explorer",
    layout="wide",
    initial_sidebar_state="expanded",
)

st.title("PPE Metrics Explorer")


@st.cache_data
def load_data(file_path):
    """Load JSON results from file_path, cached by Streamlit across reruns."""
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data


def contains_list(column):
    """Return True if any cell in this DataFrame column is a list."""
    return column.apply(lambda x: isinstance(x, list)).any()


# Metrics where lower is better; they are displayed as (1 - metric) so that
# higher is always better in the table.
INVERT = {'brier', 'loss'}

# Metrics reported on a 0-1 scale; rescaled to 0-100 for display.
SCALE = {'accuracy', 'row-wise pearson', 'confidence_agreement', 'spearman',
         'kendalltau', 'arena_under_curve', 'mean_max_score', 'mean_end_score'}
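
# Assumed shape of results.json, inferred from the flattening logic below:
#   {benchmark: {model_file: {subkey: {metric_name: value, ...}, ...} or scalar, ...}, ...}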


def main():
    data = load_data('results.json')

    # Sort benchmarks alphabetically, pinning "human_preference_v1" to the top.
    benchmarks = sorted(data.keys(), key=lambda s: "A" + s if s == "human_preference_v1" else s)

    selected_benchmark = st.selectbox("Select a Benchmark", benchmarks)
    benchmark_data = data[selected_benchmark]

    # One record per model: Model, Type, and the flattened metric columns.
    records = []
    for model, metrics in benchmark_data.items():
        # LLM judge results are stored as .jsonl files; everything else is a reward model.
        model_type = "LLM Judge" if model.endswith(".jsonl") else "Reward Model"
        # Reduce the file path to a bare model name (drop directories and extension).
        model = path_split(path_splitext(model)[0])[-1]

        if isinstance(metrics, dict):
            # Flatten nested {subkey: {metric: value}} dicts into "subkey - metric" columns.
            flattened_metrics = {}
            for subkey, submetrics in metrics.items():
                if isinstance(submetrics, dict):
                    for metric_name, value in submetrics.items():
                        if metric_name in SCALE:
                            value = 100 * value
                        if metric_name in INVERT:
                            key = f"{subkey} - (1 - {metric_name})"
                            flattened_metrics[key] = 1 - value
                        else:
                            key = f"{subkey} - {metric_name}"
                            flattened_metrics[key] = value
                else:
                    flattened_metrics[subkey] = submetrics

            records.append({
                "Model": model,
                "Type": model_type,
                **flattened_metrics
            })
        else:
            # Scalar metrics get a single "Value" column.
            records.append({
                "Model": model,
                "Type": model_type,
                "Value": metrics
            })

    df = pd.DataFrame(records)

    # Drop columns containing lists; they cannot be sorted or styled.
    df = df.loc[:, ~df.apply(contains_list)]

    if "human" not in selected_benchmark:
        # Order columns alphabetically with "Type" first; "(1 - ...)" names are keyed
        # as if "(1" were "l" so inverted metrics sort alongside the others.
        df = df[sorted(df.columns, key=lambda s: s.replace("(1", "l").lower() if s != "Type" else "A")]

    df.set_index(["Model"], inplace=True)

    # Layout: a metric-name search box, a model filter box, and a wider spacer column.
    col1, col2, col3 = st.columns([1, 1, 2])
    with col1:
        column_search = st.text_input(
            "Search metrics", placeholder="Search metrics...",
            key="search", label_visibility="collapsed",
        )
    with col2:
        model_search = st.text_input(
            "Filter models", placeholder="Filter Models (separate criteria with ,) ...",
            key="search2", label_visibility="collapsed",
        )
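
    # Turn the comma-separated model filter into a regex alternation,
    # e.g. "gpt, llama" becomes "gpt|llama" for the case-insensitive match below.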
    model_search_crit = model_search.replace(", ", "|").replace(",", "|")

    if column_search:
        # Keep the Type column plus every metric column whose name contains the query.
        filtered_columns = [col for col in df.columns if col != "Type" and column_search.lower() in col.lower()]
        if filtered_columns:
            df_display = df[["Type"] + filtered_columns]
        else:
            st.warning("No columns match your search.")
            df_display = pd.DataFrame()
    else:
        df_display = df

    if model_search and len(df_display):
        df_display = df_display[df_display.index.str.contains(model_search_crit, case=False)]
        if len(df_display) == 0:
            st.warning("No models match your filter.")
            df_display = pd.DataFrame()
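
    # Only sort and style when there is data; an empty frame has no metric column to sort by.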
    st.dataframe(
        df_display.sort_values(df_display.columns[1], ascending=False)
        .style.background_gradient(cmap='summer_r', axis=0)
        .format(precision=4)
        if len(df_display) else df_display,
        use_container_width=True,
        height=500,
    )

    csv = df_display.to_csv()
    st.download_button(
        label="Download data as CSV",
        data=csv,
        file_name=f"{selected_benchmark}_metrics.csv",
        mime='text/csv',
    )


if __name__ == "__main__":
    main()
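
# Assumed usage: run with `streamlit run <path to this file>` from a directory
# containing results.json.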