import gradio as gr

from src.utils import model_hyperlink, process_score

# Display-column name -> gradio Dataframe datatype for the LLM-perf leaderboard.
LEADERBOARD_COLUMN_TO_DATATYPE = {
    # open llm
    "Model ๐Ÿค—": "markdown",
    "Experiment ๐Ÿงช": "str",
    # primary measurements
    "Prefill (s)": "number",
    "Decode (tokens/s)": "number",
    "Memory (MB)": "number",
    "Energy (tokens/kWh)": "number",
    # deployment settings
    "Backend ๐Ÿญ": "str",
    "Precision ๐Ÿ“ฅ": "str",
    "Quantization ๐Ÿ—œ๏ธ": "str",
    "Attention ๐Ÿ‘๏ธ": "str",
    "Kernel โš›๏ธ": "str",
    # additional measurements
    # "Reserved Memory (MB)": "number",
    # "Used Memory (MB)": "number",
    "Open LLM Score (%)": "number",
    "End-to-End (s)": "number",
    "Architecture ๐Ÿ›๏ธ": "str",
    "Params (B)": "number",
}

# Columns shown by default in the LLM-perf leaderboard table.
PRIMARY_COLUMNS = [
    "Model ๐Ÿค—",
    "Experiment ๐Ÿงช",
    "Prefill (s)",
    "Decode (tokens/s)",
    "Memory (MB)",
    "Energy (tokens/kWh)",
    "Open LLM Score (%)",
]

# BGB capability columns (all selectable in the capabilities checkbox group).
CAPABILITY_COLUMNS = [
    "Grounding โšก๏ธ",
    "Instruction Following ๐Ÿ“",
    "Planning ๐Ÿ“…",
    "Reasoning ๐Ÿ’ก",
    "Refinement ๐Ÿ”ฉ",
    "Safety โš ๏ธ",
    "Theory of Mind ๐Ÿค”",
    "Tool Usage ๐Ÿ› ๏ธ",
    "Multilingual ๐Ÿ‡ฌ๐Ÿ‡ซ",
]

# Raw BGB eval-dataframe column name -> display-column name.
BGB_COLUMN_MAPPING = {
    "model_name_or_path": "Model ๐Ÿค—",
    "average": "Average",
    "grounding": "Grounding โšก๏ธ",
    "instruction_following": "Instruction Following ๐Ÿ“",
    "planning": "Planning ๐Ÿ“…",
    "reasoning": "Reasoning ๐Ÿ’ก",
    "refinement": "Refinement ๐Ÿ”ฉ",
    "safety": "Safety โš ๏ธ",
    "theory_of_mind": "Theory of Mind ๐Ÿค”",
    "tool_usage": "Tool Usage ๐Ÿ› ๏ธ",
    "multilingual": "Multilingual ๐Ÿ‡ฌ๐Ÿ‡ซ",
    "model_params": "Model Params (B)",
    "model_type": "Model Type",
}

# BGB display-column name -> gradio Dataframe datatype.
BGB_COLUMN_TO_DATATYPE = {
    "Model ๐Ÿค—": "markdown",
    "Average": "number",
    "Grounding โšก๏ธ": "number",
    "Instruction Following ๐Ÿ“": "number",
    "Planning ๐Ÿ“…": "number",
    "Reasoning ๐Ÿ’ก": "number",
    "Refinement ๐Ÿ”ฉ": "number",
    "Safety โš ๏ธ": "number",
    "Theory of Mind ๐Ÿค”": "number",
    "Tool Usage ๐Ÿ› ๏ธ": "number",
    "Multilingual ๐Ÿ‡ฌ๐Ÿ‡ซ": "number",
    "Model Params (B)": "number",
    "Model Type": "str",
}


def process_model(model_name):
    """Return *model_name* rendered as a markdown hyperlink to its Hugging Face Hub page."""
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


# TODO: Process base, chat, proprietary models differently
def process_bgb_model(row):
    """Render a BGB model as a markdown hyperlink.

    Expects *row* to be a two-element row (model name, model type), as produced
    by ``df[["Model ๐Ÿค—", "Model Type"]].apply(..., axis=1)``. Base/Chat models
    link to the Hugging Face Hub; Proprietary models link to their vendor page.

    Raises:
        NotImplementedError: for an unrecognized model type.
    """
    model_name = row.iloc[0]
    model_type = row.iloc[1]

    if model_type == "Base" or model_type == "Chat":
        link = f"https://huggingface.co/{model_name}"
        return model_hyperlink(link, model_name)
    elif model_type == "Proprietary":
        # Vendor announcement/docs pages for closed API models.
        api_model_2_link = {
            "gpt-3.5-turbo-1106": "https://platform.openai.com/docs/models/gpt-3-5",
            "gpt-3.5-turbo-0125": "https://platform.openai.com/docs/models/gpt-3-5",
            "gpt-4-0125-preview": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday",
            "gpt-4-1106-preview": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday",
            "gpt-4-turbo-2024-04-09": "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4",
            "gpt-4o-2024-05-13": "https://openai.com/index/hello-gpt-4o/",
            "claude-3-haiku-20240307": "https://www.anthropic.com/news/claude-3-family",
            "claude-3-opus-20240229": "https://www.anthropic.com/news/claude-3-family",
            "claude-3-sonnet-20240229": "https://www.anthropic.com/news/claude-3-family",
            "mistral-large": "https://mistral.ai/news/mistral-large/",
            "mistral-medium": "https://mistral.ai/news/la-plateforme/",
            "gemini-1.0-pro": "https://deepmind.google/technologies/gemini/pro/",
            "gemini-pro-1.5": "https://deepmind.google/technologies/gemini/pro/",
            "google/gemini-flash-1.5": "https://deepmind.google/technologies/gemini/flash/",
        }
        link = api_model_2_link[model_name]
        return model_hyperlink(link, model_name)
    else:
        raise NotImplementedError(f"Model type {model_type} not implemented")


def get_leaderboard_df(llm_perf_df):
    """Return a copy of *llm_perf_df* transformed for display in the leaderboard."""
    df = llm_perf_df.copy()
    # transform for leaderboard
    # FIX: the "Model ๐Ÿค—" column holds plain name strings, so it must go through
    # process_model; process_bgb_model expects a (name, type) row and would
    # fail with an AttributeError on str.iloc.
    df["Model ๐Ÿค—"] = df["Model ๐Ÿค—"].apply(process_model)
    # process quantization for leaderboard
    df["Open LLM Score (%)"] = df.apply(lambda x: process_score(x["Open LLM Score (%)"], x["Quantization ๐Ÿ—œ๏ธ"]), axis=1)
    return df


def get_bgb_leaderboard_df(eval_df):
    """Return a copy of *eval_df* with model names linkified for the BGB leaderboard."""
    df = eval_df.copy()
    # transform for leaderboard; needs both name and type to pick the right link
    df["Model ๐Ÿค—"] = df[["Model ๐Ÿค—", "Model Type"]].apply(process_bgb_model, axis=1)
    return df


def create_leaderboard_table(llm_perf_df):
    """Build the LLM-perf leaderboard widgets.

    Returns:
        (search_bar, columns_checkboxes, leaderboard_table) gradio components.
    """
    # get dataframe
    leaderboard_df = get_leaderboard_df(llm_perf_df)

    # create search bar
    with gr.Row():
        search_bar = gr.Textbox(
            label="Model ๐Ÿค—",
            info="๐Ÿ” Search for a model name",
            elem_id="search-bar",
        )
    # create checkboxes
    with gr.Row():
        columns_checkboxes = gr.CheckboxGroup(
            label="Columns ๐Ÿ“Š",
            value=PRIMARY_COLUMNS,
            choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
            info="โ˜‘๏ธ Select the columns to display",
            elem_id="columns-checkboxes",
        )
    # create table
    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df[PRIMARY_COLUMNS],
        datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
        headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
        elem_id="leaderboard-table",
    )

    return search_bar, columns_checkboxes, leaderboard_table


def create_bgb_leaderboard_table(eval_df):
    """Build the BGB leaderboard widgets.

    Returns:
        (search_bar, columns_checkboxes, type_checkboxes, param_slider,
        bgb_leaderboard_table) gradio components.
    """
    # get dataframe
    bgb_leaderboard_df = get_bgb_leaderboard_df(eval_df)

    # create search bar
    with gr.Row():
        search_bar = gr.Textbox(
            label="Model ๐Ÿค—",
            info="๐Ÿ” Search for a model name",
            elem_id="search-bar",
        )

    with gr.Row():
        type_checkboxes = gr.CheckboxGroup(
            label="Model Type",
            value=["Base", "Chat", "Proprietary"],
            choices=["Base", "Chat", "Proprietary"],
            # FIX: this group filters model types, not capabilities
            info="โ˜‘๏ธ Select the model types to display",
            elem_id="type-checkboxes",
        )

    with gr.Row():
        param_slider = gr.Slider(
            minimum=0, maximum=150, value=7, step=1, interactive=True, label="Model Params (B)", elem_id="param-slider"
        )

    # create checkboxes
    with gr.Row():
        columns_checkboxes = gr.CheckboxGroup(
            label="Capabilities ๐Ÿ“Š",
            value=CAPABILITY_COLUMNS,
            choices=CAPABILITY_COLUMNS,
            info="โ˜‘๏ธ Select the capabilities to display",
            elem_id="columns-checkboxes",
        )

    # create table
    bgb_leaderboard_table = gr.components.Dataframe(
        value=bgb_leaderboard_df[list(BGB_COLUMN_MAPPING.values())],
        datatype=list(BGB_COLUMN_TO_DATATYPE.values()),
        # FIX: headers must be the display names, matching `value` and
        # `datatype`; the mapping's keys are internal eval-dataframe names.
        headers=list(BGB_COLUMN_MAPPING.values()),
        elem_id="leaderboard-table",
    )

    return search_bar, columns_checkboxes, type_checkboxes, param_slider, bgb_leaderboard_table