Spaces:

andrewrreed
/

closed-vs-open-arena-elo

Running on CPU Upgrade

App Files Files Community

closed-vs-open-arena-elo / utils.py

andrewrreed HF staff

bug fix + update new model releases dates

33eb9c4 4 months ago

raw

history blame

8.42 kB

	import json
	from datetime import datetime

	from typing import Literal, List

	import pandas as pd
	import plotly.express as px
	from huggingface_hub import HfFileSystem, hf_hub_download

	# from: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/monitor.py#L389
	# Maps short category keys to the human-readable category names shown to users.
	# NOTE(review): presumably the keys match the category identifiers found in the
	# downloaded arena result files - confirm against the caller that reads them.
	KEY_TO_CATEGORY_NAME = {
	"full": "Overall",
	"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
	"math": "Math",
	"if": "Instruction Following",
	"multiturn": "Multi-Turn",
	"coding": "Coding",
	"hard_6": "Hard Prompts (Overall)",
	"hard_english_6": "Hard Prompts (English)",
	"long_user": "Longer Query",
	"english": "English",
	"chinese": "Chinese",
	"french": "French",
	"german": "German",
	"spanish": "Spanish",
	"russian": "Russian",
	"japanese": "Japanese",
	"korean": "Korean",
	"no_tie": "Exclude Ties",
	"no_short": "Exclude Short Query (< 5 tokens)",
	"no_refusal": "Exclude Refusal",
	# This key has no friendlier display name; it maps to itself.
	"overall_limit_5_user_vote": "overall_limit_5_user_vote",
	"full_old": "Overall (Deprecated)",
	}

	# Maps each human-readable category name (the values of KEY_TO_CATEGORY_NAME)
	# to a longer explanation string. Some explanations contain Markdown links.
	CAT_NAME_TO_EXPLANATION = {
	"Overall": "Overall Questions",
	"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
	"Math": "Math",
	"Instruction Following": "Instruction Following",
	"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
	"Coding": "Coding: whether conversation contains code snippets",
	"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
	"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
	"Longer Query": "Longer Query (>= 500 tokens)",
	"English": "English Prompts",
	"Chinese": "Chinese Prompts",
	"French": "French Prompts",
	"German": "German Prompts",
	"Spanish": "Spanish Prompts",
	"Russian": "Russian Prompts",
	"Japanese": "Japanese Prompts",
	"Korean": "Korean Prompts",
	"Exclude Ties": "Exclude Ties and Bothbad",
	"Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
	"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
	# No dedicated explanation exists for this category; the name is reused.
	"overall_limit_5_user_vote": "overall_limit_5_user_vote",
	"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
	}

	# Raw license values that format_data() labels as "Proprietary LLM"; every
	# other value is labelled "Open LLM".
	# NOTE(review): "Proprietory" is presumably listed to catch a misspelling in
	# the upstream data - confirm before removing it.
	PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]


	def download_latest_data_from_space(
	    repo_id: str, file_type: Literal["pkl", "csv"]
	) -> str:
	    """
	    Downloads the latest data file of the specified file type from the given repository space.

	    Only files in the root of the Space whose file name contains
	    "leaderboard_table" or "elo_results" are considered. The "latest" file is
	    the one whose date token (the text after the last underscore of the file
	    stem) is greatest when compared as a string. The name of the selected
	    file is printed to stdout.

	    Args:
	        repo_id (str): The ID of the repository space.
	        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".

	    Returns:
	        str: The local file path of the downloaded data file.

	    Raises:
	        FileNotFoundError: If the Space contains no matching data file.
	    """

	    def basename(path):
	        return path.split("/")[-1]

	    def extract_date(path):
	        # "<dir>/<prefix>_<date>.<ext>" -> "<date>"
	        # NOTE(review): string comparison only orders dates correctly for
	        # fixed-width, most-significant-first tokens (e.g. YYYYMMDD) - confirm
	        # the upstream naming scheme.
	        return basename(path).split(".")[0].split("_")[-1]

	    fs = HfFileSystem()
	    pattern = f"spaces/{repo_id}/*.{file_type}"
	    # Match on the file name only, so that a repo_id which happens to contain
	    # one of the markers does not make every file in the Space a candidate.
	    candidates = [
	        path
	        for path in fs.glob(pattern)
	        if "leaderboard_table" in basename(path) or "elo_results" in basename(path)
	    ]
	    if not candidates:
	        raise FileNotFoundError(
	            f"No 'leaderboard_table' or 'elo_results' .{file_type} file found "
	            f"in space '{repo_id}'."
	        )

	    # max() returns the first maximal element, the same file a stable
	    # descending sort would place first.
	    latest_filename = basename(max(candidates, key=extract_date))

	    latest_filepath_local = hf_hub_download(
	        repo_id=repo_id,
	        filename=latest_filename,
	        repo_type="space",
	    )
	    print(latest_filename)
	    return latest_filepath_local


	def get_constants(dfs):
	"""
	Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month.

	Parameters:
	- dfs (dict): A dictionary containing DataFrames for different categories.

	Returns:
	- min_elo_score (float): The minimum Elo score across all DataFrames.
	- max_elo_score (float): The maximum Elo score across all DataFrames.
	- upper_models_per_month (int): The maximum number of models per month per license across all DataFrames.
	"""
	filter_ranges = {}
	for k, df in dfs.items():
	filter_ranges[k] = {
	"min_elo_score": df["rating"].min().round(),
	"max_elo_score": df["rating"].max().round(),
	"upper_models_per_month": int(
	df.groupby(["Month-Year", "License"])["rating"]
	.apply(lambda x: x.count())
	.max()
	),
	}

	min_elo_score = float("inf")
	max_elo_score = float("-inf")
	upper_models_per_month = 0

	for _, value in filter_ranges.items():
	min_elo_score = min(min_elo_score, value["min_elo_score"])
	max_elo_score = max(max_elo_score, value["max_elo_score"])
	upper_models_per_month = max(
	upper_models_per_month, value["upper_models_per_month"]
	)
	return min_elo_score, max_elo_score, upper_models_per_month


	def update_release_date_mapping(
	    new_model_keys_to_add: List[str],
	    leaderboard_df: pd.DataFrame,
	    release_date_mapping: pd.DataFrame,
	    mapping_path: str = "release_date_mapping.json",
	) -> pd.DataFrame:
	    """
	    Update the release date mapping with new model keys.

	    For every new key an entry with the model's name (looked up in
	    `leaderboard_df`) and today's date as "Release Date" is appended to the
	    JSON file at `mapping_path`. The file is read once and written once, and
	    all entries are built before the file is touched, so a failed lookup
	    leaves the file unchanged. The mapping is then reloaded from the file.

	    Args:
	        new_model_keys_to_add (List[str]): A list of new model keys to add to the release date mapping.
	        leaderboard_df (pd.DataFrame): The leaderboard DataFrame containing the model information.
	            Must have "key" and "Model" columns.
	        release_date_mapping (pd.DataFrame): The current release date mapping DataFrame.
	            Returned unchanged when there are no new keys.
	        mapping_path (str): Path of the JSON file (a list of records) holding the mapping.

	    Returns:
	        pd.DataFrame: The updated release date mapping DataFrame.

	    Raises:
	        IndexError: If a key is not present in `leaderboard_df["key"]`.
	    """
	    # Nothing to add: keep the mapping the caller already has.
	    if not new_model_keys_to_add:
	        return release_date_mapping

	    release_date = datetime.today().strftime("%Y-%m-%d")
	    new_entries = [
	        {
	            "key": key,
	            "Model": leaderboard_df[leaderboard_df["key"] == key]["Model"].values[0],
	            "Release Date": release_date,
	        }
	        for key in new_model_keys_to_add
	    ]

	    with open(mapping_path, "r", encoding="utf-8") as file:
	        data = json.load(file)

	    data.extend(new_entries)

	    with open(mapping_path, "w", encoding="utf-8") as file:
	        json.dump(data, file, indent=4)

	    for key in new_model_keys_to_add:
	        print(f"Added {key} to {mapping_path}")

	    # reload the release date mapping
	    return pd.read_json(mapping_path, orient="records")


	def format_data(df):
	    """
	    Normalise a leaderboard DataFrame for plotting.

	    The following columns are rewritten on the DataFrame that is passed in
	    (the caller's object is modified):
	    - 'License': 'Proprietary LLM' when the value is in PROPRIETARY_LICENSES, otherwise 'Open LLM'.
	    - 'Release Date': converted to datetime.
	    - 'Month-Year': new column holding the monthly period of 'Release Date'.
	    - 'rating': rounded to the nearest integer value.

	    Args:
	        df (pandas.DataFrame): The DataFrame to be formatted.

	    Returns:
	        pandas.DataFrame: The formatted DataFrame with a fresh 0..n-1 index.
	    """

	    def license_label(raw_license):
	        if raw_license in PROPRIETARY_LICENSES:
	            return "Proprietary LLM"
	        return "Open LLM"

	    df["License"] = df["License"].apply(license_label)

	    df["Release Date"] = pd.to_datetime(df["Release Date"])
	    release_dates = df["Release Date"]
	    df["Month-Year"] = release_dates.dt.to_period("M")

	    df["rating"] = df["rating"].round(decimals=0)

	    formatted = df.reset_index(drop=True)
	    return formatted


	def get_trendlines(fig):
	    """
	    Collect the fitted trendline parameters of a Plotly Express figure.

	    Args:
	        fig: A figure accepted by `px.get_trendline_results`.

	    Returns:
	        list: One list per trendline row, holding that row's
	        "px_fit_results" `params` converted with `tolist()`.
	        NOTE(review): presumably [intercept, slope] for an OLS fit - confirm.
	    """
	    fit_table = px.get_trendline_results(fig)

	    parameters = []
	    for _, row in fit_table.iterrows():
	        parameters.append(row["px_fit_results"].params.tolist())
	    return parameters


	def find_crossover_point(b1, m1, b2, m2):
	    """
	    Determine the X value at which two trendlines will cross over.

	    Each trendline is the straight line y = m * x + b. Note the argument
	    order: for each line the intercept comes before the slope.

	    Parameters:
	    b1 (float): Intercept of the first trendline.
	    m1 (float): Slope of the first trendline.
	    b2 (float): Intercept of the second trendline.
	    m2 (float): Slope of the second trendline.

	    Returns:
	    float: The X value where the two trendlines cross.

	    Raises:
	    ValueError: If both trendlines have the same slope.
	    """
	    if m1 == m2:
	        raise ValueError("The trendlines are parallel and do not cross.")

	    # m1 * x + b1 == m2 * x + b2  =>  x = (b2 - b1) / (m1 - m2)
	    return (b2 - b1) / (m1 - m2)