Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 8,424 Bytes
4df8d2a 167137b 4df8d2a 6bd3956 167137b 6fbf558 167137b 6fbf558 33eb9c4 ec01232 167137b 6fbf558 167137b ec01232 167137b 6fbf558 ec01232 167137b ec01232 167137b 6fbf558 33eb9c4 ec01232 167137b 6fbf558 167137b ec01232 167137b 6fbf558 ec01232 167137b 557f1e5 167137b 33eb9c4 167137b 33eb9c4 167137b 4ae93a7 4df8d2a 6bd3956 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
import json
from datetime import datetime
from typing import Literal, List
import pandas as pd
import plotly.express as px
from huggingface_hub import HfFileSystem, hf_hub_download
# from: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/monitor.py#L389
# Maps the short category keys used in the arena data files to the
# human-readable category names shown in the UI.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
    "math": "Math",
    "if": "Instruction Following",
    "multiturn": "Multi-Turn",
    "coding": "Coding",
    "hard_6": "Hard Prompts (Overall)",
    "hard_english_6": "Hard Prompts (English)",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "german": "German",
    "spanish": "Spanish",
    "russian": "Russian",
    "japanese": "Japanese",
    "korean": "Korean",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
    "overall_limit_5_user_vote": "overall_limit_5_user_vote",
    "full_old": "Overall (Deprecated)",
}
# Maps each human-readable category name to a longer explanation
# (markdown allowed) displayed alongside the category in the UI.
CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
    "Math": "Math",
    "Instruction Following": "Instruction Following",
    "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
    "Coding": "Coding: whether conversation contains code snippets",
    "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
    "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "German": "German Prompts",
    "Spanish": "Spanish Prompts",
    "Russian": "Russian Prompts",
    "Japanese": "Japanese Prompts",
    "Korean": "Korean Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
    "overall_limit_5_user_vote": "overall_limit_5_user_vote",
    "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
}
# License strings that count as closed-source; "Proprietory" covers a
# misspelling that appears in the upstream data.
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Download the latest data file of the specified file type from the given
    repository space.

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to
            download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        FileNotFoundError: If no matching data file exists in the space
            (previously this surfaced as a bare IndexError).
    """

    def extract_date(filename):
        # Filenames end in an "_"-separated date token before the extension
        # (e.g. "elo_results_20240101.pkl"); the token sorts lexicographically.
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    files = [
        file for file in files if "leaderboard_table" in file or "elo_results" in file
    ]
    if not files:
        raise FileNotFoundError(
            f"No leaderboard_table/elo_results .{file_type} file found in {repo_id}"
        )
    # max() with a key avoids sorting the whole list just to take the newest.
    latest_file = max(files, key=extract_date)
    latest_filepath_local = hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
    print(latest_file.split("/")[-1])
    return latest_filepath_local
def get_constants(dfs):
"""
Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month.
Parameters:
- dfs (dict): A dictionary containing DataFrames for different categories.
Returns:
- min_elo_score (float): The minimum Elo score across all DataFrames.
- max_elo_score (float): The maximum Elo score across all DataFrames.
- upper_models_per_month (int): The maximum number of models per month per license across all DataFrames.
"""
filter_ranges = {}
for k, df in dfs.items():
filter_ranges[k] = {
"min_elo_score": df["rating"].min().round(),
"max_elo_score": df["rating"].max().round(),
"upper_models_per_month": int(
df.groupby(["Month-Year", "License"])["rating"]
.apply(lambda x: x.count())
.max()
),
}
min_elo_score = float("inf")
max_elo_score = float("-inf")
upper_models_per_month = 0
for _, value in filter_ranges.items():
min_elo_score = min(min_elo_score, value["min_elo_score"])
max_elo_score = max(max_elo_score, value["max_elo_score"])
upper_models_per_month = max(
upper_models_per_month, value["upper_models_per_month"]
)
return min_elo_score, max_elo_score, upper_models_per_month
def update_release_date_mapping(
    new_model_keys_to_add: List[str],
    leaderboard_df: pd.DataFrame,
    release_date_mapping: pd.DataFrame,
) -> pd.DataFrame:
    """
    Update the release date mapping with new model keys.

    New models are appended to ``release_date_mapping.json`` with today's date
    as their release date, and the mapping is reloaded from disk. The file is
    read once and written once (previously it was re-read and rewritten for
    every key).

    Args:
        new_model_keys_to_add (List[str]): A list of new model keys to add to
            the release date mapping.
        leaderboard_df (pd.DataFrame): The leaderboard DataFrame containing
            the model information ("key" and "Model" columns).
        release_date_mapping (pd.DataFrame): The current release date mapping
            DataFrame.

    Returns:
        pd.DataFrame: The updated release date mapping DataFrame (the input
        mapping, unchanged, when there is nothing to add).
    """
    # if any, add those to the release date mapping
    if new_model_keys_to_add:
        with open("release_date_mapping.json", "r") as file:
            data = json.load(file)
        today = datetime.today().strftime("%Y-%m-%d")
        for key in new_model_keys_to_add:
            new_entry = {
                "key": key,
                "Model": leaderboard_df.loc[
                    leaderboard_df["key"] == key, "Model"
                ].values[0],
                "Release Date": today,
            }
            data.append(new_entry)
            print(f"Added {key} to release_date_mapping.json")
        with open("release_date_mapping.json", "w") as file:
            json.dump(data, file, indent=4)
        # reload the release date mapping from the freshly written file
        release_date_mapping = pd.read_json(
            "release_date_mapping.json", orient="records"
        )
    return release_date_mapping
def format_data(df):
    """
    Normalize a leaderboard DataFrame for plotting.

    - Collapses 'License' to "Proprietary LLM" / "Open LLM" (values listed in
      PROPRIETARY_LICENSES count as proprietary).
    - Parses 'Release Date' into datetimes.
    - Derives a 'Month-Year' period column from 'Release Date'.
    - Rounds 'rating' to the nearest integer.
    - Resets the index.

    Note: columns are modified on the input DataFrame in place; the (reindexed)
    result is also returned.

    Args:
        df (pandas.DataFrame): The DataFrame to be formatted.

    Returns:
        pandas.DataFrame: The formatted DataFrame.
    """
    is_proprietary = df["License"].isin(PROPRIETARY_LICENSES)
    df["License"] = is_proprietary.map({True: "Proprietary LLM", False: "Open LLM"})
    release_dates = pd.to_datetime(df["Release Date"])
    df["Release Date"] = release_dates
    df["Month-Year"] = release_dates.dt.to_period("M")
    df["rating"] = df["rating"].round()
    return df.reset_index(drop=True)
def get_trendlines(fig):
    """
    Extract the fitted trendline coefficients from a plotly figure.

    Args:
        fig: A plotly figure created with a trendline (e.g. px.scatter(..., trendline="ols")).

    Returns:
        list[list[float]]: One parameter list (intercept, slope, ...) per trendline.
    """
    coefficient_lists = []
    for _, row in px.get_trendline_results(fig).iterrows():
        coefficient_lists.append(row["px_fit_results"].params.tolist())
    return coefficient_lists
def find_crossover_point(b1, m1, b2, m2):
    """
    Determine the X value at which two trendlines cross.

    Parameters:
        b1 (float): Intercept of the first trendline.
        m1 (float): Slope of the first trendline.
        b2 (float): Intercept of the second trendline.
        m2 (float): Slope of the second trendline.

    Returns:
        float: The X value where the two trendlines cross.

    Raises:
        ValueError: If the trendlines have equal slopes (parallel lines).
    """
    if m1 == m2:
        raise ValueError("The trendlines are parallel and do not cross.")
    # Solve b1 + m1*x == b2 + m2*x for x.
    return (b2 - b1) / (m1 - m2)
|