# Hugging Face Space page banner captured by the page scrape: "Running on CPU Upgrade".
import os | |
import pickle | |
import pandas as pd | |
import numpy as np | |
import gradio as gr | |
from datetime import datetime | |
from huggingface_hub import HfApi | |
from apscheduler.schedulers.background import BackgroundScheduler | |
import plotly.graph_objects as go | |
from utils import ( | |
KEY_TO_CATEGORY_NAME, | |
CAT_NAME_TO_EXPLANATION, | |
download_latest_data_from_space, | |
get_constants, | |
update_release_date_mapping, | |
format_data, | |
get_trendlines, | |
find_crossover_point, | |
sigmoid_transition, | |
apply_template, | |
) | |
################### | |
### Initialize scheduler | |
################### | |
# def restart_space(): | |
# HfApi(token=os.getenv("HF_TOKEN", None)).restart_space( | |
# repo_id="m-ric/llm-race-to-the-top" | |
# ) | |
# print(f"Space restarted on {datetime.now()}") | |
# # restart the space every day at 07:00 in the scheduler's time zone (hour=7 in the cron job below)
# scheduler = BackgroundScheduler() | |
# scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0) | |
# scheduler.start() | |
###################
### Load Data
###################
# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)
# SECURITY NOTE: unpickling can execute arbitrary code stored in the file. This pickle
# is downloaded from an external Space, so loading it is only safe while that
# repository is trusted.
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# TO-DO: need to also include vision
elo_results = elo_results["text"]

# One leaderboard table per category, keyed by display name. Categories that are
# missing from the downloaded results are skipped.
arena_dfs = {
    KEY_TO_CATEGORY_NAME[k]: elo_results[k]["leaderboard_table_df"]
    for k in KEY_TO_CATEGORY_NAME
    if k in elo_results
}

# gather the leaderboard CSV published in the same Space (it is joined to the ELO
# tables on its "key" column further down)
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

# load release date mapping data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
###################
### Prepare Data
###################
# update release date mapping with new models
# check for new models in ELO data
# (build the lookup set once instead of re-materialising the column for every model)
known_model_keys = set(release_date_mapping["key"])
new_model_keys_to_add = [
    model
    for model in arena_dfs["Overall"].index.to_list()
    if model not in known_model_keys
]
if new_model_keys_to_add:
    release_date_mapping = update_release_date_mapping(
        new_model_keys_to_add, leaderboard_df, release_date_mapping
    )

# merge leaderboard data with ELO data (ELO tables are indexed by model key)
merged_dfs = {
    name: (
        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for name, arena_df in arena_dfs.items()
}

# add release dates into the merged data
release_dates = release_date_mapping[["key", "Release Date"]]
merged_dfs = {
    name: pd.merge(merged, release_dates, on="key")
    for name, merged in merged_dfs.items()
}

# format dataframes
merged_dfs = {k: format_data(v) for k, v in merged_dfs.items()}

# get constants
min_elo_score, max_elo_score, _ = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]

# Keep only models with a known release date. The explicit copy makes the column
# assignment below operate on an independent frame rather than on a slice.
ratings_df = merged_dfs["Overall"]
ratings_df = ratings_df.loc[ratings_df["Release Date"].notna()].copy()
# Fold the two spellings of the same organisation into one.
ratings_df["Organization"] = ratings_df["Organization"].replace(
    "DeepSeek AI", "DeepSeek"
)
################### | |
### Build and Plot Data | |
################### | |
def get_data_split(dfs, set_name):
    """Return an independent copy of ``dfs[set_name]`` with a fresh 0..n-1 index.

    Args:
        dfs: Mapping from split name to ``pandas.DataFrame``.
        set_name: Key of the frame to extract.

    Returns:
        pandas.DataFrame: Deep copy of the selected frame; the original index is
        dropped, so changes to the result never reach ``dfs``.

    Raises:
        KeyError: If ``set_name`` is not a key of ``dfs``.
    """
    split = dfs[set_name].copy(deep=True)
    return split.reset_index(drop=True)
def clean_df_for_display(df):
    """Build the table shown to users from a merged ratings frame.

    Selects the display columns, renames ``rating`` to ``ELO Score`` and
    ``MT-bench (score)`` to ``MT-Bench``, converts ``Release Date`` to strings,
    and orders rows by descending ELO score with a fresh index.

    Args:
        df (pandas.DataFrame): Frame containing at least the columns listed in
            ``display_columns`` below. It is not modified.

    Returns:
        pandas.DataFrame: New frame with the eight display columns.

    Raises:
        KeyError: If any of the expected columns is missing from ``df``.
    """
    display_columns = [
        "Model",
        "rating",
        "MMLU",
        "MT-bench (score)",
        "Release Date",
        "Organization",
        "License",
        "Link",
    ]
    shown = df.loc[:, display_columns].rename(
        columns={"rating": "ELO Score", "MT-bench (score)": "MT-Bench"}
    )
    shown["Release Date"] = shown["Release Date"].astype(str)
    return shown.sort_values("ELO Score", ascending=False).reset_index(drop=True)
def format_data(df):
    """Normalise licence, release-date and rating columns of a ratings frame.

    NOTE: this module also imports a ``format_data`` from ``utils``; this later
    definition rebinds the name for any code that runs after it.

    Operations, applied to a copy so the caller's frame is left untouched:

    - ``License`` becomes ``"Proprietary LLM"`` when the value is listed in
      ``PROPRIETARY_LICENSES`` and ``"Open LLM"`` otherwise (missing values
      therefore count as open).
    - ``Release Date`` is converted to datetime.
    - ``Month-Year`` is added as the monthly period of ``Release Date``.
    - ``rating`` is rounded to zero decimals.
    - The index is reset to 0..n-1.

    Args:
        df (pandas.DataFrame): Frame with ``License``, ``Release Date`` and
            ``rating`` columns.

    Returns:
        pandas.DataFrame: The formatted copy.
    """
    # The second entry is a misspelling; presumably it occurs in the source data.
    PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]

    formatted = df.copy()
    is_proprietary = formatted["License"].isin(PROPRIETARY_LICENSES)
    formatted["License"] = is_proprietary.map(
        {True: "Proprietary LLM", False: "Open LLM"}
    )
    formatted["Release Date"] = pd.to_datetime(formatted["Release Date"])
    formatted["Month-Year"] = formatted["Release Date"].dt.to_period("M")
    formatted["rating"] = formatted["rating"].round()
    return formatted.reset_index(drop=True)
# Organization -> (line colour, country flag emoji) used when plotting.
# Flags are regional-indicator letter pairs, written as escapes so the source
# survives tools that do not handle UTF-8 correctly.
_FLAG_US = "\U0001F1FA\U0001F1F8"  # United States
_FLAG_CN = "\U0001F1E8\U0001F1F3"  # China
_FLAG_FR = "\U0001F1EB\U0001F1F7"  # France
_FLAG_IL = "\U0001F1EE\U0001F1F1"  # Israel

org_info = {
    "OpenAI": ("#00A67E", _FLAG_US),  # Teal
    "Google": ("#4285F4", _FLAG_US),  # Google Blue
    "xAI": ("black", _FLAG_US),  # Black
    "Anthropic": ("#cc785c", _FLAG_US),  # Brown (as requested)
    "Meta": ("#0064E0", _FLAG_US),  # Facebook Blue
    "Alibaba": ("#6958cf", _FLAG_CN),
    "DeepSeek": ("#9900CC", _FLAG_CN),  # Purple
    "01 AI": ("#11871e", _FLAG_CN),  # Bright Green
    "DeepSeek AI": ("#9900CC", _FLAG_CN),  # Purple (alternate spelling, same style)
    "Mistral": ("#ff7000", _FLAG_FR),  # Mistral Orange (as requested)
    "AI21 Labs": ("#1E90FF", _FLAG_IL),  # Dodger Blue
    # NOTE(review): Reka AI and Zhipu AI share one colour, so their lines cannot
    # be told apart by colour alone - confirm whether that is intended.
    "Reka AI": ("#FFC300", _FLAG_US),
    "Zhipu AI": ("#FFC300", _FLAG_CN),
    "Nvidia": ("#76B900", _FLAG_US),
}
def _best_so_far_curve(org_data):
    """Build one organisation's "best rating so far" curve.

    Args:
        org_data (pandas.DataFrame): Rows of a single organisation with
            ``Release Date`` (datetime), ``rating`` and ``Model`` columns.

    Returns:
        tuple: ``(x_values, y_values, best_models)`` where the first two are the
        curve coordinates (including the smoothed transitions) and
        ``best_models`` lists the rows that each set a new record.
    """
    x_values = []
    y_values = []
    best_models = []
    current_best = -np.inf

    # One row per release date. Sorting best-first (stable, so already-sorted
    # input is left as is) makes ``first()`` pick the top-rated model of each
    # date regardless of the order the caller supplies.
    daily_best = (
        org_data.sort_values("rating", ascending=False, kind="stable")
        .groupby("Release Date")
        .first()
        .reset_index()
    )

    for _, row in daily_best.iterrows():
        # Only a strictly higher rating moves the curve (NaN never does).
        if not row["rating"] > current_best:
            continue
        if x_values:
            # Smooth transition from the previous record to the new one: at
            # least 100 points, shaped by sigmoid_transition over [-6, 6].
            transition_days = (row["Release Date"] - x_values[-1]).days
            transition_points = pd.date_range(
                x_values[-1],
                row["Release Date"],
                periods=max(100, transition_days),
            )
            x_values.extend(transition_points)
            transition_y = current_best + (
                row["rating"] - current_best
            ) * sigmoid_transition(
                np.linspace(-6, 6, len(transition_points)), 0, k=1
            )
            y_values.extend(transition_y)
        x_values.append(row["Release Date"])
        y_values.append(row["rating"])
        current_best = row["rating"]
        best_models.append(row)

    return x_values, y_values, best_models


def make_figure(original_df, start_time_gradio, speak_french):
    """Plot, per organisation, the best ELO rating reached over time.

    Args:
        original_df (pandas.DataFrame): Ratings with ``Organization``,
            ``rating``, ``Release Date`` and ``Model`` columns. Not modified.
        start_time_gradio: Start of the visible time window, passed to
            ``pd.to_datetime(..., unit="s")`` (i.e. seconds since the epoch).
        speak_french (bool): Use French titles when true, English otherwise.

    Returns:
        tuple: ``(fig, df)`` - the plotly figure and the copy of the input frame
        with ``Release Date`` converted to datetime.
    """
    fig = go.Figure()
    start_date = pd.to_datetime(start_time_gradio, unit="s")
    # Bound once up front: it is needed for the x-axis range even when the frame
    # holds no organisation at all.
    current_date = pd.Timestamp.now()

    df = original_df.copy(deep=True)
    df["Release Date"] = pd.to_datetime(df["Release Date"])

    # Organisations ordered by their best rating; the position is the legend rank.
    orgs_by_best_rating = (
        df.groupby("Organization")["rating"]
        .max()
        .sort_values(ascending=False)
        .index.tolist()
    )

    for i, org in enumerate(orgs_by_best_rating):
        org_data = df[df["Organization"] == org]
        x_values, y_values, best_models = _best_so_far_curve(org_data)
        if not x_values:
            # No usable rating for this organisation: nothing to draw.
            continue

        # Extend the line to the current date
        if x_values[-1] < current_date:
            x_values.append(current_date)
            y_values.append(y_values[-1])

        # Get org color and flag (grey, no flag for unknown organisations)
        color, flag = org_info.get(org, ("#808080", ""))

        # Add line plot
        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=y_values,
                mode="lines",
                name=f"{i+1}. {org} {flag}",
                line=dict(color=color, width=2),
                hoverinfo="skip",
            )
        )

        # Add scatter plot for best model points
        best_models_df = pd.DataFrame(best_models)
        fig.add_trace(
            go.Scatter(
                x=best_models_df["Release Date"],
                y=best_models_df["rating"],
                mode="markers",
                name=org,
                showlegend=False,
                marker=dict(color=color, size=8, symbol="circle"),
                text=best_models_df["Model"],
                hovertemplate="<b>%{text}</b><br>Date: %{x}<br>ELO Score: %{y:.2f}<extra></extra>",
            )
        )

    # Update layout
    # NOTE(review): the legend titles hard-code "November 2024" although the data
    # is downloaded at start-up - confirm whether they should follow the data.
    if speak_french:
        fig.update_layout(
            title="La course au classement",
            yaxis_title="Score ELO",
            legend_title="Classement en Novembre 2024",
        )
    else:
        fig.update_layout(
            yaxis_title="ELO score on Chatbot Arena",
            legend_title="Ranking as of November 2024",
            title="The race for the best LLM",
        )
    # Log the value received by this call (not the module-level UI component).
    print("START TIME:", start_date)

    margin = 30
    # Lower bound comes from models released inside the window; fall back to the
    # whole frame when the window is empty so the bound is not NaN.
    ratings_in_window = df.loc[df["Release Date"] >= start_date, "rating"].dropna()
    lowest_rating = (
        ratings_in_window.min() if not ratings_in_window.empty else df["rating"].min()
    )
    fig.update_layout(
        xaxis_title="Date",
        hovermode="closest",
        xaxis_range=[start_date, current_date],  # Extend x-axis for labels
        # NOTE(review): the margin is *added* to the lower bound as in the
        # original code; a padding would normally subtract it - confirm intent.
        yaxis_range=[lowest_rating + margin, df["rating"].max() + margin],
    )
    apply_template(fig, annotation_text="Aymeric Roucher", height=600)
    fig.update_xaxes(
        tickformat="%m-%Y",
    )
    return fig, df
def filter_df(top_n_orgs=11, minimum_rating=1000):
    """Keep the rows of ``ratings_df`` that belong to the top-N organisations.

    Organisations are ranked by the highest rating any of their models reached.

    Args:
        top_n_orgs: Number of organisations to keep; converted with ``int``.
        minimum_rating: Currently unused. It is kept so existing callers keep
            working; no rating threshold is applied.

    Returns:
        pandas.DataFrame: Selection of the module-level ``ratings_df``.
    """
    best_rating_per_org = ratings_df.groupby("Organization")["rating"].max()
    top_orgs = best_rating_per_org.nlargest(int(top_n_orgs)).index.tolist()
    return ratings_df.loc[ratings_df["Organization"].isin(top_orgs)]
# Gradio UI: three controls, a plot tab and a raw-data tab, wired so that every
# control change redraws the figure.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.sky,
        secondary_hue=gr.themes.colors.green,
        # spacing_size=gr.themes.sizes.spacing_sm,
        text_size=gr.themes.sizes.text_sm,
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-serif",
            "system-ui",
            "serif",
        ],
    ),
) as demo:
    # Holds the output of filter_df between the filter step and the plot step.
    filtered_df = gr.State()

    with gr.Row():
        top_n_orgs = gr.Slider(minimum=1, maximum=15, value=11, step=1, label="View top N companies")
        # minimum_rating = gr.Slider(minimum=800, maximum=1300, value=1000, step=1, label="Restrict to ELO scores above N")
        start_time = gr.DateTime(value="2024-01-01 00:00:00", label="Start time")
        speak_french = gr.Checkbox(value=False, label="Parler français")

    with gr.Group():
        with gr.Tab("Plot"):
            plot = gr.Plot(show_label=False)
        with gr.Tab("Raw Data"):
            display_df = gr.DataFrame()

    # NOTE(review): the last sentence below mentions OLS trend lines, but
    # make_figure in this file draws no regression lines - confirm and update.
    gr.Markdown(
        """
This app visualizes the progress of LLMs over time as scored by the [LMSYS Chatbot Arena](https://leaderboard.lmsys.org/).
The app is adapted from [this app](https://huggingface.co/spaces/andrewrreed/closed-vs-open-arena-elo) by Andrew Reed,
and is intended to stay up-to-date as new models are released and evaluated.
> ### Plot info
> The ELO score (y-axis) is a measure of the relative strength of a model based on its performance against other models in the arena.
> The Release Date (x-axis) corresponds to when the model was first publicly released or when its ELO results were first reported (for ease of automated updates).
> Trend lines are based on Ordinary Least Squares (OLS) regression and adjust based on the filter criteria.
"""
    )

    # The same two steps are reused by several events, so define each once.
    refilter = dict(fn=filter_df, inputs=[top_n_orgs], outputs=filtered_df)
    redraw = dict(
        fn=make_figure,
        inputs=[filtered_df, start_time, speak_french],
        outputs=[plot, display_df],
    )

    # Page load and organisation-count changes: re-filter, then redraw.
    demo.load(**refilter).then(**redraw)
    top_n_orgs.change(**refilter).then(**redraw)
    # Start date and language only affect the drawing step.
    start_time.change(**redraw)
    speak_french.change(**redraw)

demo.launch()