open_pl_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

open_pl_llm_leaderboard / visualizations.py

sheonhan

Add GPT-4 & human eval tab

0227006 over 1 year ago

raw

history blame

4.47 kB

	import math

	import numpy as np
	import pandas as pd
	import plotly.express as px


	# 1
	def compute_pairwise_win_fraction(battles):
	# Times each model wins as Model A
	a_win_ptbl = pd.pivot_table(
	battles[battles["win"] == "model_a"],
	index="model_a",
	columns="model_b",
	aggfunc="size",
	fill_value=0,
	)

	# Table counting times each model wins as Model B
	b_win_ptbl = pd.pivot_table(
	battles[battles["win"] == "model_b"],
	index="model_a",
	columns="model_b",
	aggfunc="size",
	fill_value=0,
	)

	# Table counting number of A-B pairs
	num_battles_ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)

	# Computing the proportion of wins for each model as A and as B
	# against all other models
	row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (num_battles_ptbl + num_battles_ptbl.T)

	# Arrange ordering according to proprition of wins
	prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
	model_names = list(prop_wins.keys())
	row_beats_col = row_beats_col_freq.loc[model_names, model_names]
	return row_beats_col


	def visualize_pairwise_win_fraction(battles, title):
	row_beats_col = compute_pairwise_win_fraction(battles)
	fig = px.imshow(row_beats_col, color_continuous_scale="RdBu", text_auto=".2f", title=title)
	fig.update_layout(
	xaxis_title="Model B",
	yaxis_title="Model A",
	xaxis_side="top",
	title_y=0.07,
	title_x=0.5,
	)
	fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>")
	return fig


	# 2
	def switch_model_a_b(df):
	df_switch = df.copy()
	# switch with probability 0.5
	for i, row in df.iterrows():
	if np.random.rand() < 0.5:
	df_switch.at[i, "model_a"] = row["model_b"]
	df_switch.at[i, "model_b"] = row["model_a"]
	if row["win"] == "model_a":
	df_switch.at[i, "win"] = "model_b"
	elif row["win"] == "model_b":
	df_switch.at[i, "win"] = "model_a"
	return df_switch


	def visualize_battle_count(battles, title):
	ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
	battle_counts = ptbl + ptbl.T
	ordering = battle_counts.sum().sort_values(ascending=False).index
	fig = px.imshow(battle_counts.loc[ordering, ordering], title=title, text_auto=True, width=600)
	fig.update_layout(
	xaxis_title="Model B",
	yaxis_title="Model A",
	xaxis_side="top",
	title_y=0.07,
	title_x=0.5,
	)
	fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
	return fig


	# 3
	def get_bootstrap_result(battles, func_compute_elo, num_round):
	rows = [func_compute_elo(battles.sample(frac=1.0, replace=True)) for _ in range(num_round)]
	df = pd.DataFrame(rows)
	return df[df.median().sort_values(ascending=False).index]


	def visualize_bootstrap_scores(df, title):
	bars = (
	pd.DataFrame(
	dict(
	lower=df.quantile(0.025),
	rating=df.quantile(0.5),
	upper=df.quantile(0.975),
	)
	)
	.reset_index(names="model")
	.sort_values("rating", ascending=False)
	)
	bars["error_y"] = bars["upper"] - bars["rating"]
	bars["error_y_minus"] = bars["rating"] - bars["lower"]
	bars["rating_rounded"] = np.round(bars["rating"], 2)
	fig = px.scatter(
	bars,
	x="model",
	y="rating",
	error_y="error_y",
	error_y_minus="error_y_minus",
	text="rating_rounded",
	title=title,
	)
	fig.update_layout(xaxis_title="Model", yaxis_title="Rating")
	return fig


	# 4
	def visualize_rating_count(df, title):
	df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
	fig = px.bar(df_all_value_counts, title=title, text_auto=True)

	min_y = df_all_value_counts.min()
	max_y = df_all_value_counts.max()

	y_end = math.ceil(min_y / 100) * 100
	y_begin = math.floor(max_y / 100) * 100

	fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
	fig.update_yaxes(range=[y_begin, y_end])
	# save the plot for the blog:
	fig.write_html("model_counts.html", full_html=False, include_plotlyjs="cdn")
	return fig