Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 8,424 Bytes
4df8d2a 167137b 4df8d2a 6bd3956 167137b 6fbf558 167137b 6fbf558 33eb9c4 ec01232 167137b 6fbf558 167137b ec01232 167137b 6fbf558 ec01232 167137b ec01232 167137b 6fbf558 33eb9c4 ec01232 167137b 6fbf558 167137b ec01232 167137b 6fbf558 ec01232 167137b 557f1e5 167137b 33eb9c4 167137b 33eb9c4 167137b 4ae93a7 4df8d2a 6bd3956 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
import json
from datetime import datetime
from typing import Literal, List
import pandas as pd
import plotly.express as px
from huggingface_hub import HfFileSystem, hf_hub_download
# from: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/monitor.py#L389
# Maps the short category keys used in the arena data files to the
# human-readable category names shown in the UI.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
    "math": "Math",
    "if": "Instruction Following",
    "multiturn": "Multi-Turn",
    "coding": "Coding",
    "hard_6": "Hard Prompts (Overall)",
    "hard_english_6": "Hard Prompts (English)",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "german": "German",
    "spanish": "Spanish",
    "russian": "Russian",
    "japanese": "Japanese",
    "korean": "Korean",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
    "overall_limit_5_user_vote": "overall_limit_5_user_vote",
    "full_old": "Overall (Deprecated)",
}
# Maps each human-readable category name to a longer explanation
# (markdown allowed) displayed alongside the category in the UI.
CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
    "Math": "Math",
    "Instruction Following": "Instruction Following",
    "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
    "Coding": "Coding: whether conversation contains code snippets",
    "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
    "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "German": "German Prompts",
    "Spanish": "Spanish Prompts",
    "Russian": "Russian Prompts",
    "Japanese": "Japanese Prompts",
    "Korean": "Korean Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
    "overall_limit_5_user_vote": "overall_limit_5_user_vote",
    "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
}
# License strings that count as closed-source; "Proprietory" covers a
# misspelling that appears in the upstream data.
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Download the latest data file of the specified file type from the given
    repository space.

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to
            download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        FileNotFoundError: If no matching data file exists in the space
            (previously this surfaced as a bare IndexError).
    """

    def extract_date(filename):
        # Filenames end in an "_"-separated date token before the extension
        # (e.g. "elo_results_20240101.pkl"); the token sorts lexicographically.
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    files = [
        file for file in files if "leaderboard_table" in file or "elo_results" in file
    ]
    if not files:
        raise FileNotFoundError(
            f"No leaderboard_table/elo_results .{file_type} file found in {repo_id}"
        )
    # max() with a key avoids sorting the whole list just to take the newest.
    latest_file = max(files, key=extract_date)
    latest_filepath_local = hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
    print(latest_file.split("/")[-1])
    return latest_filepath_local
def get_constants(dfs):
"""
Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month.
Parameters:
- dfs (dict): A dictionary containing DataFrames for different categories.
Returns:
- min_elo_score (float): The minimum Elo score across all DataFrames.
- max_elo_score (float): The maximum Elo score across all DataFrames.
- upper_models_per_month (int): The maximum number of models per month per license across all DataFrames.
"""
filter_ranges = {}
for k, df in dfs.items():
filter_ranges[k] = {
"min_elo_score": df["rating"].min().round(),
"max_elo_score": df["rating"].max().round(),
"upper_models_per_month": int(
df.groupby(["Month-Year", "License"])["rating"]
.apply(lambda x: x.count())
.max()
),
}
min_elo_score = float("inf")
max_elo_score = float("-inf")
upper_models_per_month = 0
for _, value in filter_ranges.items():
min_elo_score = min(min_elo_score, value["min_elo_score"])
max_elo_score = max(max_elo_score, value["max_elo_score"])
upper_models_per_month = max(
upper_models_per_month, value["upper_models_per_month"]
)
return min_elo_score, max_elo_score, upper_models_per_month
def update_release_date_mapping(
    new_model_keys_to_add: List[str],
    leaderboard_df: pd.DataFrame,
    release_date_mapping: pd.DataFrame,
) -> pd.DataFrame:
    """
    Update the release date mapping with new model keys.

    New models are appended to ``release_date_mapping.json`` with today's date
    as their release date, and the mapping is reloaded from disk. The file is
    read once and written once (previously it was re-read and rewritten for
    every key).

    Args:
        new_model_keys_to_add (List[str]): A list of new model keys to add to
            the release date mapping.
        leaderboard_df (pd.DataFrame): The leaderboard DataFrame containing
            the model information ("key" and "Model" columns).
        release_date_mapping (pd.DataFrame): The current release date mapping
            DataFrame.

    Returns:
        pd.DataFrame: The updated release date mapping DataFrame (the input
        mapping, unchanged, when there is nothing to add).
    """
    # if any, add those to the release date mapping
    if new_model_keys_to_add:
        with open("release_date_mapping.json", "r") as file:
            data = json.load(file)
        today = datetime.today().strftime("%Y-%m-%d")
        for key in new_model_keys_to_add:
            new_entry = {
                "key": key,
                "Model": leaderboard_df.loc[
                    leaderboard_df["key"] == key, "Model"
                ].values[0],
                "Release Date": today,
            }
            data.append(new_entry)
            print(f"Added {key} to release_date_mapping.json")
        with open("release_date_mapping.json", "w") as file:
            json.dump(data, file, indent=4)
        # reload the release date mapping from the freshly written file
        release_date_mapping = pd.read_json(
            "release_date_mapping.json", orient="records"
        )
    return release_date_mapping
def format_data(df):
    """
    Normalize a leaderboard DataFrame for plotting.

    - Collapses 'License' to "Proprietary LLM" / "Open LLM" (values listed in
      PROPRIETARY_LICENSES count as proprietary).
    - Parses 'Release Date' into datetimes.
    - Derives a 'Month-Year' period column from 'Release Date'.
    - Rounds 'rating' to the nearest integer.
    - Resets the index.

    Note: columns are modified on the input DataFrame in place; the (reindexed)
    result is also returned.

    Args:
        df (pandas.DataFrame): The DataFrame to be formatted.

    Returns:
        pandas.DataFrame: The formatted DataFrame.
    """
    is_proprietary = df["License"].isin(PROPRIETARY_LICENSES)
    df["License"] = is_proprietary.map({True: "Proprietary LLM", False: "Open LLM"})
    release_dates = pd.to_datetime(df["Release Date"])
    df["Release Date"] = release_dates
    df["Month-Year"] = release_dates.dt.to_period("M")
    df["rating"] = df["rating"].round()
    return df.reset_index(drop=True)
def get_trendlines(fig):
    """
    Extract the fitted trendline coefficients from a plotly figure.

    Args:
        fig: A plotly figure created with a trendline (e.g. px.scatter(..., trendline="ols")).

    Returns:
        list[list[float]]: One parameter list (intercept, slope, ...) per trendline.
    """
    coefficient_lists = []
    for _, row in px.get_trendline_results(fig).iterrows():
        coefficient_lists.append(row["px_fit_results"].params.tolist())
    return coefficient_lists
def find_crossover_point(b1, m1, b2, m2):
    """
    Determine the X value at which two trendlines cross.

    Parameters:
        b1 (float): Intercept of the first trendline.
        m1 (float): Slope of the first trendline.
        b2 (float): Intercept of the second trendline.
        m2 (float): Slope of the second trendline.

    Returns:
        float: The X value where the two trendlines cross.

    Raises:
        ValueError: If the trendlines have equal slopes (parallel lines).
    """
    if m1 == m2:
        raise ValueError("The trendlines are parallel and do not cross.")
    # Solve b1 + m1*x == b2 + m2*x for x.
    return (b2 - b1) / (m1 - m2)
|