We express our sincere gratitude to NetMind.AI for their generous donation of GPUs, which plays a crucial role in ensuring the continuous operation of our Leaderboard.
-    return f"{error}"
-
-
-def styled_warning(warn):
-    return f"{warn}"
-
-
-def styled_message(message):
-    return f"{message}
" - - -def has_no_nan_values(df, columns): - return df[columns].notna().all(axis=1) - - -def has_nan_values(df, columns): - return df[columns].isna().any(axis=1) diff --git a/open-moe-llm-leaderboard-gh/src/display/imgs/Netmind.AI_LOGO.jpg b/open-moe-llm-leaderboard-gh/src/display/imgs/Netmind.AI_LOGO.jpg deleted file mode 100644 index 6ccff65e32b3fa0545a66ee3937df979ed542891..0000000000000000000000000000000000000000 Binary files a/open-moe-llm-leaderboard-gh/src/display/imgs/Netmind.AI_LOGO.jpg and /dev/null differ diff --git a/open-moe-llm-leaderboard-gh/src/display/utils.py b/open-moe-llm-leaderboard-gh/src/display/utils.py deleted file mode 100644 index 2dc5c094370da143b544a76c71079b690ed86ebf..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/display/utils.py +++ /dev/null @@ -1,274 +0,0 @@ -from dataclasses import dataclass, make_dataclass -from enum import Enum - -import pandas as pd - - -def fields(raw_class): - return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] - -E2Es = "E2E(s)" #"End-to-end time (s)" -PREs = "PRE(s)" #"Prefilling time (s)" -TS = "T/s" #Decoding throughput (tok/s) -InFrame = "Method" #"Inference framework" -MULTIPLE_CHOICEs = ["mmlu"] - -GPU_TEMP = 'Temp(C)' -GPU_Power = 'Power(W)' -GPU_Mem = 'Mem(G)' -GPU_Name = "GPU" -GPU_Util = 'Util(%)' -MFU = 'MFU(%)' -MBU = 'MBU(%)' -BATCH_SIZE = 'bs' -PRECISION = "Precision" -system_metrics_to_name_map = { - "end_to_end_time": f"{E2Es}", - "prefilling_time": f"{PREs}", - "decoding_throughput": f"{TS}", - "mfu": f"{MFU}", - "mbu": f"{MBU}" -} - -gpu_metrics_to_name_map = { - GPU_Util: GPU_Util, - GPU_TEMP: GPU_TEMP, - GPU_Power: GPU_Power, - GPU_Mem: GPU_Mem, - "batch_size": BATCH_SIZE, - "precision": PRECISION, - GPU_Name: GPU_Name, - MFU: MFU, - MBU: MBU -} - -@dataclass -class Task: - benchmark: str - metric: str - col_name: str - - -class Tasks(Enum): - # XXX include me back at some point - # nqopen = Task("nq8", "em", "NQ Open/EM") - # triviaqa = Task("tqa8", "em", "TriviaQA/EM") - - # truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthQA MC1/Acc") - # truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthQA MC2/Acc") - # truthfulqa_gen = Task("truthfulqa_gen", "rougeL_acc", "TruthQA Gen/ROUGE") - - # xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE") - # xsum_f = Task("xsum_v2", "factKB", "XSum/factKB") - # xsum_b = Task("xsum_v2", "bertscore_precision", "XSum/BERT-P") - - # cnndm_r = Task("cnndm_v2", "rougeL", "CNN-DM/ROUGE") - # cnndm_f = Task("cnndm_v2", "factKB", "CNN-DM/factKB") - # cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P") - - # race = Task("race", "acc", "RACE/Acc") - # squadv2 = Task("squadv2", "exact", "SQUaDv2/EM") - - # memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc") - # ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc") - - # faithdial = Task("faithdial_hallu_v2", "acc", "FaithDial/Acc") - - # halueval_qa = Task("halueval_qa", "acc", "HaluQA/Acc") - # halueval_summ = Task("halueval_summarization", "acc", "HaluSumm/Acc") - # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc") - - # # XXX include me back at some point - selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT") - mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot) - gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot) - gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot) - - -# These classes are for user facing column names, -# to avoid having to change them all around the code -# 
when a modif is needed -@dataclass -class ColumnContent: - name: str - type: str - displayed_by_default: bool - hidden: bool = False - never_hidden: bool = False - dummy: bool = False - - -auto_eval_column_dict = [] -# Init -auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)]) -auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) - -# #Scores -# # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)]) - -# Inference framework -auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)]) - -for task in Tasks: - auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) - # System performance metrics - auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)]) - auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)]) - # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)]) - auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)]) - auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)]) - auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)]) - if task.value.benchmark in MULTIPLE_CHOICEs: - continue - # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)]) - auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)]) - auto_eval_column_dict.append([f"{task.name}_mbu", ColumnContent, ColumnContent(f"{task.value.col_name} {MBU}", "number", True, hidden=True)]) - auto_eval_column_dict.append([f"{task.name}_mfu", ColumnContent, ColumnContent(f"{task.value.col_name} {MFU}", "number", True, hidden=True)]) - - -# Model information -auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)]) -auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]) -auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]) -auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)]) -auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)]) -auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]) -auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]) -auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) -auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) -# Dummy column for the search bar (hidden by the custom CSS) -auto_eval_column_dict.append(["dummy", ColumnContent, 
ColumnContent("model_name_for_query", "str", False, dummy=True)]) - -# We use make dataclass to dynamically fill the scores from Tasks -AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) - - -@dataclass(frozen=True) -class EvalQueueColumn: # Queue column - model = ColumnContent("model", "markdown", True) - revision = ColumnContent("revision", "str", True) - private = ColumnContent("private", "bool", True) - precision = ColumnContent("precision", "str", True) - weight_type = ColumnContent("weight_type", "str", "Original") - model_framework = ColumnContent("inference_framework", "str", True) - status = ColumnContent("status", "str", True) - - -@dataclass -class ModelDetails: - name: str - symbol: str = "" # emoji, only for the model type - - -class ModelType(Enum): - PT = ModelDetails(name="pretrained", symbol="🟢") - FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶") - chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬") - merges = ModelDetails(name="base merges and moerges", symbol="🤝") - Unknown = ModelDetails(name="", symbol="?") - - def to_str(self, separator=" "): - return f"{self.value.symbol}{separator}{self.value.name}" - - @staticmethod - def from_str(type): - if "fine-tuned" in type or "🔶" in type: - return ModelType.FT - if "pretrained" in type or "🟢" in type: - return ModelType.PT - if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]): - return ModelType.chat - if "merge" in type or "🤝" in type: - return ModelType.merges - return ModelType.Unknown - - -class InferenceFramework(Enum): - # "moe-infinity", hf-chat - MoE_Infinity = ModelDetails("moe-infinity") - HF_Chat = ModelDetails("hf-chat") - Unknown = ModelDetails("?") - - def to_str(self): - return self.value.name - - @staticmethod - def from_str(inference_framework: str): - if inference_framework in ["moe-infinity"]: - return InferenceFramework.MoE_Infinity - if inference_framework in ["hf-chat"]: - return InferenceFramework.HF_Chat - return InferenceFramework.Unknown - -class GPUType(Enum): - H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB") - A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB") - A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB") - Unknown = ModelDetails("?") - - def to_str(self): - return self.value.name - - @staticmethod - def from_str(gpu_type: str): - if gpu_type in ["NVIDIA-H100-PCIe-80GB"]: - return GPUType.A100_pcie - if gpu_type in ["NVIDIA-A100-PCIe-80GB"]: - return GPUType.H100_pcie - if gpu_type in ["NVIDIA-A5000-24GB"]: - return GPUType.A5000 - return GPUType.Unknown - -class WeightType(Enum): - Adapter = ModelDetails("Adapter") - Original = ModelDetails("Original") - Delta = ModelDetails("Delta") - - -class Precision(Enum): - float32 = ModelDetails("float32") - float16 = ModelDetails("float16") - bfloat16 = ModelDetails("bfloat16") - qt_8bit = ModelDetails("8bit") - qt_4bit = ModelDetails("4bit") - qt_GPTQ = ModelDetails("GPTQ") - Unknown = ModelDetails("?") - - @staticmethod - def from_str(precision: str): - if precision in ["torch.float32", "float32"]: - return Precision.float32 - if precision in ["torch.float16", "float16"]: - return Precision.float16 - if precision in ["torch.bfloat16", "bfloat16"]: - return Precision.bfloat16 - if precision in ["8bit"]: - return Precision.qt_8bit - if precision in ["4bit"]: - return Precision.qt_4bit - if precision in ["GPTQ", "None"]: - return Precision.qt_GPTQ - return Precision.Unknown - - -# Column selection -COLS = [c.name for c in 
fields(AutoEvalColumn)] -TYPES = [c.type for c in fields(AutoEvalColumn)] -COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] -TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] - -EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] -EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] - -BENCHMARK_COLS = [t.value.col_name for t in Tasks] - -# NUMERIC_INTERVALS = { -# "?": pd.Interval(-1, 0, closed="right"), -# "~1.5": pd.Interval(0, 2, closed="right"), -# "~3": pd.Interval(2, 4, closed="right"), -# "~7": pd.Interval(4, 9, closed="right"), -# "~13": pd.Interval(9, 20, closed="right"), -# "~35": pd.Interval(20, 45, closed="right"), -# "~60": pd.Interval(45, 70, closed="right"), -# "70+": pd.Interval(70, 10000, closed="right"), -# } diff --git a/open-moe-llm-leaderboard-gh/src/envs.py b/open-moe-llm-leaderboard-gh/src/envs.py deleted file mode 100644 index 0ee354bb13392b1c1a3abc26343ee8401b7239f0..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/envs.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - -from huggingface_hub import HfApi - -# clone / pull the lmeh eval data -H4_TOKEN = os.environ.get("H4_TOKEN", None) - -# REPO_ID = "pminervini/sparse-generative-ai" -REPO_ID = "sparse-generative-ai/open-moe-llm-leaderboard" - -QUEUE_REPO = "sparse-generative-ai/requests" -QUEUE_REPO_OPEN_LLM = "open-llm-leaderboard/requests" -RESULTS_REPO = "sparse-generative-ai/results" - -DEBUG_QUEUE_REPO = "sparse-generative-ai/debug_requests" -DEBUG_RESULTS_REPO = "sparse-generative-ai/debug_results" - -IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True)) - -CACHE_PATH = os.getenv("HF_HOME", ".") - -EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue") -EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results") -EVAL_REQUESTS_PATH_OPEN_LLM = os.path.join(CACHE_PATH, "eval-queue-open-llm") - -EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private" -EVAL_RESULTS_PATH_PRIVATE = "eval-results-private" - -PATH_TO_COLLECTION = "sparse-generative-ai/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03" - -# Rate limit variables -RATE_LIMIT_PERIOD = 7 -RATE_LIMIT_QUOTA = 5 -HAS_HIGHER_RATE_LIMIT = ["TheBloke"] - -API = HfApi(token=H4_TOKEN) diff --git a/open-moe-llm-leaderboard-gh/src/leaderboard/filter_models.py b/open-moe-llm-leaderboard-gh/src/leaderboard/filter_models.py deleted file mode 100644 index efbe83cf4d7203fca388b7afd1d801bd00dfc626..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/leaderboard/filter_models.py +++ /dev/null @@ -1,50 +0,0 @@ -from src.display.formatting import model_hyperlink -from src.display.utils import AutoEvalColumn - -# Models which have been flagged by users as being problematic for a reason or another -# (Model name to forum discussion link) -FLAGGED_MODELS = { - "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202", - "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207", - "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213", - "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236", - "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237", - "gaodrew/gaodrew-gorgonzola-13b": 
"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215", - "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287", - "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287", - "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287", -} - -# Models which have been requested by orgs to not be submitted on the leaderboard -DO_NOT_SUBMIT_MODELS = [ - "Voicelab/trurl-2-13b", # trained on MMLU -] - - -def flag_models(leaderboard_data: list[dict]): - for model_data in leaderboard_data: - if model_data["model_name_for_query"] in FLAGGED_MODELS: - issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1] - issue_link = model_hyperlink( - FLAGGED_MODELS[model_data["model_name_for_query"]], - f"See discussion #{issue_num}", - ) - model_data[AutoEvalColumn.model.name] = ( - f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}" - ) - - -def remove_forbidden_models(leaderboard_data: list[dict]): - indices_to_remove = [] - for ix, model in enumerate(leaderboard_data): - if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS: - indices_to_remove.append(ix) - - for ix in reversed(indices_to_remove): - leaderboard_data.pop(ix) - return leaderboard_data - - -def filter_models(leaderboard_data: list[dict]): - leaderboard_data = remove_forbidden_models(leaderboard_data) - flag_models(leaderboard_data) diff --git a/open-moe-llm-leaderboard-gh/src/leaderboard/read_evals.py b/open-moe-llm-leaderboard-gh/src/leaderboard/read_evals.py deleted file mode 100644 index bd75bb4d916a9843e6f1670850d827734e91f945..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/leaderboard/read_evals.py +++ /dev/null @@ -1,290 +0,0 @@ -import glob -import json -import os -from tqdm import tqdm -from dataclasses import dataclass - -import dateutil - -# import numpy as np - -from src.display.formatting import make_clickable_model -from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType -from src.submission.check_validity import is_model_on_hub - -from typing import Optional - - -def is_float(string): - try: - float(string) - return True - except ValueError: - return False - - -@dataclass -class EvalResult: - # Also see src.display.utils.AutoEvalColumn for what will be displayed. - eval_name: str # org_model_precision (uid) - full_model: str # org/model (path on hub) - org: str - model: str - revision: str # commit hash, "" if main - results: dict - precision: Precision = Precision.Unknown - model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ... - weight_type: WeightType = WeightType.Original # Original or Adapter - architecture: str = "Unknown" # From config file - license: str = "?" 
- likes: int = 0 - num_params: int = 0 - date: str = "" # submission date of request file - still_on_hub: bool = False - inference_framework: str = "Unknown" - - @staticmethod - def init_from_json_file(json_filepath, is_backend: bool = False): - """Inits the result from the specific model result file""" - with open(json_filepath) as fp: - data = json.load(fp) - - # We manage the legacy config format - config = data.get("config", data.get("config_general", None)) - - # Precision - precision = Precision.from_str(config.get("model_dtype")) - - # Get model and org - org_and_model = config.get("model_name", config.get("model_args", None)) - org_and_model = org_and_model.split("/", 1) - - # Get inference framework - inference_framework = config.get("inference_framework", "Unknown") - - if len(org_and_model) == 1: - org = None - model = org_and_model[0] - result_key = f"{model}_{precision.value.name}" - else: - org = org_and_model[0] - model = org_and_model[1] - result_key = f"{org}_{model}_{precision.value.name}" - full_model = "/".join(org_and_model) - - still_on_hub, error, model_config = is_model_on_hub( - full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False - ) - architecture = "?" - if model_config is not None: - architectures = getattr(model_config, "architectures", None) - if architectures: - architecture = ";".join(architectures) - - # Extract results available in this file (some results are split in several files) - - # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}} - - results = {} - for benchmark, benchmark_results in data["results"].items(): - if benchmark not in results: - results[benchmark] = {} - - for metric, value in benchmark_results.items(): - to_add = True - if "_stderr" in metric: - to_add = False - if "alias" in metric: - to_add = False - - if "," in metric: - metric = metric.split(",")[0] - metric = metric.replace("exact_match", "em") - - if to_add is True: - multiplier = 100.0 - if "GPU" in metric: - results[benchmark][metric] = value - continue - if "precision" in metric: - results[benchmark][metric] = value - continue - - if "rouge" in metric and "truthful" not in benchmark: - multiplier = 1.0 - if "squad" in benchmark: - multiplier = 1.0 - if "time" in metric: - multiplier = 1.0 - if "throughput" in metric: - multiplier = 1.0 - if "batch_" in metric or "Mem" in metric or "Util" in metric: - multiplier = 1 - - - # print('RESULTS', data['results']) - # print('XXX', benchmark, metric, value, multiplier) - results[benchmark][metric] = value * multiplier - - res = EvalResult( - eval_name=result_key, - full_model=full_model, - org=org, - model=model, - results=results, - precision=precision, - revision=config.get("model_sha", ""), - still_on_hub=still_on_hub, - architecture=architecture, - inference_framework=inference_framework, - ) - - return res - - def update_with_request_file(self, requests_path): - """Finds the relevant request file for the current model and updates info with it""" - request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name) - - try: - with open(request_file, "r") as f: - request = json.load(f) - - self.model_type = ModelType.from_str(request.get("model_type", "")) - self.weight_type = WeightType[request.get("weight_type", "Original")] - self.license = request.get("license", "?") - self.likes = request.get("likes", 0) - self.num_params = request.get("params", 0) - self.date = request.get("submitted_time", "") - 
self.inference_framework = request.get("inference_framework", "Unknown") - except Exception as e: - print(f"Could not find request file for {self.org}/{self.model} -- path: {requests_path} -- {e}") - - def is_complete(self) -> bool: - for task in Tasks: - if task.value.benchmark not in self.results: - return False - return True - - def to_dict(self): - """Converts the Eval Result to a dict compatible with our dataframe display""" - - # breakpoint() - # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks) - - data_dict = { - "eval_name": self.eval_name, # not a column, just a save name, - AutoEvalColumn.precision.name: self.precision.value.name, - AutoEvalColumn.model_type.name: self.model_type.value.name, - AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, - AutoEvalColumn.weight_type.name: self.weight_type.value.name, - AutoEvalColumn.architecture.name: self.architecture, - AutoEvalColumn.model.name: make_clickable_model(self.full_model), - AutoEvalColumn.dummy.name: self.full_model, - AutoEvalColumn.revision.name: self.revision, - # AutoEvalColumn.average.name: average, - AutoEvalColumn.license.name: self.license, - AutoEvalColumn.likes.name: self.likes, - AutoEvalColumn.params.name: self.num_params, - AutoEvalColumn.still_on_hub.name: self.still_on_hub, - AutoEvalColumn.inference_framework.name: self.inference_framework, - } - - for task in Tasks: - if task.value.benchmark in self.results: - data_dict[task.value.col_name] = self.results[task.value.benchmark] - - return data_dict - - -def get_request_file_for_model(requests_path, model_name, precision): - """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED and RUNNING""" - request_files = os.path.join( - requests_path, - f"{model_name}_eval_request_*.json", - ) - request_files = glob.glob(request_files) - - # Select correct request file (precision) - request_file = "" - request_files = sorted(request_files, reverse=True) - - for tmp_request_file in request_files: - with open(tmp_request_file, "r") as f: - req_content = json.load(f) - if req_content["precision"] == precision.split(".")[-1]: - request_file = tmp_request_file - return request_file - - -def get_request_file_for_model_open_llm(requests_path, model_name, precision): - """Selects the correct request file for a given model. 
Only keeps runs tagged as FINISHED""" - request_files = os.path.join( - requests_path, - f"{model_name}_eval_request_*.json", - ) - request_files = glob.glob(request_files) - - # Select correct request file (precision) - request_file = "" - request_files = sorted(request_files, reverse=True) - for tmp_request_file in request_files: - with open(tmp_request_file, "r") as f: - req_content = json.load(f) - if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]: - request_file = tmp_request_file - return request_file - - -def update_model_type_with_open_llm_request_file(result, open_llm_requests_path): - """Finds the relevant request file for the current model and updates info with it""" - request_file = get_request_file_for_model_open_llm( - open_llm_requests_path, result.full_model, result.precision.value.name - ) - - if request_file: - try: - with open(request_file, "r") as f: - request = json.load(f) - open_llm_model_type = request.get("model_type", "Unknown") - if open_llm_model_type != "Unknown": - result.model_type = ModelType.from_str(open_llm_model_type) - except Exception as e: - pass - return result - - -def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]: - """From the path of the results folder root, extract all needed info for results""" - model_result_filepaths = [] - - for root, _, files in os.walk(results_path): - # We should only have json files in model results - if len(files) == 0 or any([not f.endswith(".json") for f in files]): - continue - - # Sort the files by date - try: - files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]) - except dateutil.parser._parser.ParserError: - files = [files[-1]] - - for file in files: - model_result_filepaths.append(os.path.join(root, file)) - - eval_results = {} - for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"): - # Creation of result - eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend) - eval_result.update_with_request_file(requests_path) - # Store results of same eval together - eval_name = eval_result.eval_name - if eval_name in eval_results.keys(): - eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None}) - else: - eval_results[eval_name] = eval_result - - results = [] - for v in eval_results.values(): - results.append(v) - - return results diff --git a/open-moe-llm-leaderboard-gh/src/populate.py b/open-moe-llm-leaderboard-gh/src/populate.py deleted file mode 100644 index 9d003dd07edf0590f4f84844e73743bcb67c0a19..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/populate.py +++ /dev/null @@ -1,120 +0,0 @@ -import json -import os -from tqdm import tqdm -import copy -import pandas as pd -import numpy as np - -from src.display.formatting import has_no_nan_values, make_clickable_model -from src.display.utils import AutoEvalColumn, EvalQueueColumn -from src.leaderboard.filter_models import filter_models -from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_model_type_with_open_llm_request_file - -from src.backend.envs import Tasks as BackendTasks -from src.display.utils import Tasks -from src.display.utils import system_metrics_to_name_map, gpu_metrics_to_name_map - -def get_leaderboard_df( - results_path: str, - requests_path: str, - requests_path_open_llm: str, - cols: list, - benchmark_cols: list, - is_backend: bool = 
False, -) -> tuple[list[EvalResult], pd.DataFrame]: - # Returns a list of EvalResult - raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm) - if requests_path_open_llm != "": - for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"): - raw_data[result_idx] = update_model_type_with_open_llm_request_file( - raw_data[result_idx], requests_path_open_llm - ) - - # all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()] - all_data_json_ = [v.to_dict() for v in raw_data] # include incomplete evals - - name_to_bm_map = {} - - task_iterator = Tasks - if is_backend is True: - task_iterator = BackendTasks - - for task in task_iterator: - task = task.value - name = task.col_name - bm = (task.benchmark, task.metric) - name_to_bm_map[name] = bm - - - - all_data_json = [] - for entry in all_data_json_: - new_entry = copy.deepcopy(entry) - for k, v in entry.items(): - if k in name_to_bm_map: - benchmark, metric = name_to_bm_map[k] - new_entry[k] = entry[k][metric] - for sys_metric, metric_namne in system_metrics_to_name_map.items(): - if sys_metric in entry[k]: - new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric] - - for gpu_metric, metric_namne in gpu_metrics_to_name_map.items(): - if gpu_metric in entry[k]: - new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric] - all_data_json += [new_entry] - - # all_data_json.append(baseline_row) - filter_models(all_data_json) - - df = pd.DataFrame.from_records(all_data_json) - - # if AutoEvalColumn.average.name in df: - # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False) - for col in cols: - if col not in df.columns: - df[col] = np.nan - - if not df.empty: - df = df.round(decimals=2) - - # filter out if any of the benchmarks have not been produced - # df = df[has_no_nan_values(df, benchmark_cols)] - - return raw_data, df - - -def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")] - all_evals = [] - - for entry in entries: - if ".json" in entry: - file_path = os.path.join(save_path, entry) - with open(file_path) as fp: - data = json.load(fp) - - data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) - data[EvalQueueColumn.revision.name] = data.get("revision", "main") - data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-") - - all_evals.append(data) - elif ".md" not in entry: - # this is a folder - sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")] - for sub_entry in sub_entries: - file_path = os.path.join(save_path, entry, sub_entry) - with open(file_path) as fp: - data = json.load(fp) - - data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) - data[EvalQueueColumn.revision.name] = data.get("revision", "main") - data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-") - all_evals.append(data) - - pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]] - running_list = [e for e in all_evals if e["status"] == "RUNNING"] - finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"] - df_pending = pd.DataFrame.from_records(pending_list, columns=cols) - df_running = pd.DataFrame.from_records(running_list, columns=cols) - df_finished = pd.DataFrame.from_records(finished_list, columns=cols) - return 
df_finished[cols], df_running[cols], df_pending[cols] diff --git a/open-moe-llm-leaderboard-gh/src/submission/check_validity.py b/open-moe-llm-leaderboard-gh/src/submission/check_validity.py deleted file mode 100644 index 3d2394d8a70621a2f8cf6e6b283e96aa8549cb0c..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/submission/check_validity.py +++ /dev/null @@ -1,142 +0,0 @@ -import json -import os -import re -from collections import defaultdict -from datetime import datetime, timedelta, timezone - -import huggingface_hub -from huggingface_hub import ModelCard -from huggingface_hub.hf_api import ModelInfo - -from transformers import AutoConfig, AutoTokenizer -from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config - -from src.envs import HAS_HIGHER_RATE_LIMIT - -from typing import Optional - - -# ht to @Wauplin, thank you for the snippet! -# See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317 -def check_model_card(repo_id: str) -> tuple[bool, str]: - # Returns operation status, and error message - try: - card = ModelCard.load(repo_id) - except huggingface_hub.utils.EntryNotFoundError: - return False, "Please add a model card to your model to explain how you trained/fine-tuned it." - - # Enforce license metadata - if card.data.license is None: - if not ("license_name" in card.data and "license_link" in card.data): - return False, ( - "License not found. Please add a license to your model card using the `license` metadata or a" - " `license_name`/`license_link` pair." - ) - - # Enforce card content - if len(card.text) < 200: - return False, "Please add a description to your model card, it is too short." - - return True, "" - - -def is_model_on_hub( - model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False -) -> tuple[bool, Optional[str], Optional[AutoConfig]]: - try: - config = AutoConfig.from_pretrained( - model_name, revision=revision, trust_remote_code=trust_remote_code, token=token - ) - if test_tokenizer: - try: - AutoTokenizer.from_pretrained( - model_name, revision=revision, trust_remote_code=trust_remote_code, token=token - ) - except ValueError as e: - return False, f"uses a tokenizer which is not in a transformers release: {e}", None - except Exception as e: - return ( - False, - "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", - None, - ) - return True, None, config - - except ValueError as e: - return ( - False, - "needs to be launched with `trust_remote_code=True`. 
For safety reason, we do not allow these models to be automatically submitted to the leaderboard.", - None, - ) - - except Exception as e: - return False, f"was not found on hub -- {str(e)}", None - - -def get_model_size(model_info: ModelInfo, precision: str): - size_pattern = re.compile(r"(\d\.)?\d+(b|m)") - try: - model_size = round(model_info.safetensors["total"] / 1e9, 3) - except (AttributeError, TypeError): - try: - size_match = re.search(size_pattern, model_info.modelId.lower()) - model_size = size_match.group(0) - model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3) - except AttributeError: - return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py - - size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1 - model_size = size_factor * model_size - return model_size - - -def get_model_arch(model_info: ModelInfo): - return model_info.config.get("architectures", "Unknown") - - -def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota): - if org_or_user not in users_to_submission_dates: - return True, "" - submission_dates = sorted(users_to_submission_dates[org_or_user]) - - time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ") - submissions_after_timelimit = [d for d in submission_dates if d > time_limit] - - num_models_submitted_in_period = len(submissions_after_timelimit) - if org_or_user in HAS_HIGHER_RATE_LIMIT: - rate_limit_quota = 2 * rate_limit_quota - - if num_models_submitted_in_period > rate_limit_quota: - error_msg = f"Organisation or user `{org_or_user}`" - error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard " - error_msg += f"in the last {rate_limit_period} days.\n" - error_msg += ( - "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗" - ) - return False, error_msg - return True, "" - - -def already_submitted_models(requested_models_dir: str) -> set[str]: - depth = 1 - file_names = [] - users_to_submission_dates = defaultdict(list) - - for root, _, files in os.walk(requested_models_dir): - current_depth = root.count(os.sep) - requested_models_dir.count(os.sep) - if current_depth == depth: - for file in files: - if not file.endswith(".json"): - continue - with open(os.path.join(root, file), "r") as f: - info = json.load(f) - if not info["status"] == "FINISHED" and not info["status"] == "RUNNING": - file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}") - - # Select organisation - if info["model"].count("/") == 0 or "submitted_time" not in info: - continue - organisation, _ = info["model"].split("/") - users_to_submission_dates[organisation].append(info["submitted_time"]) - - return set(file_names), users_to_submission_dates diff --git a/open-moe-llm-leaderboard-gh/src/submission/submit.py b/open-moe-llm-leaderboard-gh/src/submission/submit.py deleted file mode 100644 index d9b861ec95d2ce88642e0628b97319472aba8b9d..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/submission/submit.py +++ /dev/null @@ -1,148 +0,0 @@ -import json -import os -from datetime import datetime, timezone - -from src.display.formatting import styled_error, styled_message, styled_warning -from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, 
RATE_LIMIT_QUOTA, DEBUG_QUEUE_REPO -from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS -from src.submission.check_validity import ( - already_submitted_models, - check_model_card, - get_model_size, - is_model_on_hub, - user_submission_permission, -) - -REQUESTED_MODELS = None -USERS_TO_SUBMISSION_DATES = None - - -def add_new_eval( - model: str, - base_model: str, - revision: str, - precision: str, - private: bool, - weight_type: str, - model_type: str, - inference_framework: str, - debug: bool = False, - gpu_type: str = "NVIDIA-A100-PCIe-80GB", -): - global REQUESTED_MODELS - global USERS_TO_SUBMISSION_DATES - if not REQUESTED_MODELS: - REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH) - - if debug: - QUEUE_REPO = DEBUG_QUEUE_REPO - - user_name = "" - model_path = model - if "/" in model: - user_name = model.split("/")[0] - model_path = model.split("/")[1] - - precision = precision.split(" ")[0] - current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - - if model_type is None or model_type == "": - return styled_error("Please select a model type.") - - # Is the user rate limited? - if user_name != "": - user_can_submit, error_msg = user_submission_permission( - user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA - ) - if not user_can_submit: - return styled_error(error_msg) - - # Did the model authors forbid its submission to the leaderboard? - if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS: - return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.") - - # Does the model actually exist? - if revision == "": - revision = "main" - - # Is the model on the hub? - if weight_type in ["Delta", "Adapter"]: - base_model_on_hub, error, _ = is_model_on_hub( - model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=False - ) - if not base_model_on_hub: - return styled_error(f'Base model "{base_model}" {error}') - - if not weight_type == "Adapter": - model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=False) - if not model_on_hub: - return styled_error(f'Model "{model}" {error}') - - # Is the model info correctly filled? - try: - model_info = API.model_info(repo_id=model, revision=revision) - except Exception: - return styled_error("Could not get your model information. Please fill it up properly.") - - model_size = get_model_size(model_info=model_info, precision=precision) - - # Were the model card and license filled? 
- try: - license = model_info.cardData["license"] - except Exception: - return styled_error("Please select a license for your model") - - # TODO: Check if the inference framework is valid - - modelcard_OK, error_msg = check_model_card(model) - if not modelcard_OK: - return styled_error(error_msg) - - # Seems good, creating the eval - print("Adding new eval") - - eval_entry = { - "model": model, - "base_model": base_model, - "revision": revision, - "private": private, - "precision": precision, - "weight_type": weight_type, - "status": "PENDING", - "submitted_time": current_time, - "model_type": model_type, - "likes": model_info.likes, - "params": model_size, - "license": license, - "inference_framework": inference_framework, - "gpu_type": gpu_type - } - - # Check for duplicate submission - if f"{model}_{revision}_{precision}_{inference_framework}_{gpu_type}" in REQUESTED_MODELS: - return styled_warning("This model has been already submitted.") - - print("Creating eval file") - OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" - os.makedirs(OUT_DIR, exist_ok=True) - # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json" - out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}_{gpu_type}.json" - - with open(out_path, "w") as f: - f.write(json.dumps(eval_entry)) - - print("Uploading eval file") - API.upload_file( - path_or_fileobj=out_path, - path_in_repo=out_path.split("eval-queue/")[1], - repo_id=QUEUE_REPO, - repo_type="dataset", - commit_message=f"Add {model} to eval queue", - ) - - # Remove the local file - os.remove(out_path) - - return styled_message( - "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list." 
- ) diff --git a/open-moe-llm-leaderboard-gh/src/utils.py b/open-moe-llm-leaderboard-gh/src/utils.py deleted file mode 100644 index b6543d6e2f21ecf4c0d00efd3e20f909ad79eb02..0000000000000000000000000000000000000000 --- a/open-moe-llm-leaderboard-gh/src/utils.py +++ /dev/null @@ -1,248 +0,0 @@ -import pandas as pd -from huggingface_hub import snapshot_download -import subprocess -import re -import os -import GPUtil - -try: - from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name -except: - print("local debug: from display.utils") - from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name - -MEM_BW_DICT ={ - "NVIDIA-A100-PCIe-80GB": 1935, - "NVIDIA-A100-SXM-80GB": 2039, - "NVIDIA-H100-PCIe-80GB": 2039, - "NVIDIA-RTX-A5000-24GB": 768 -} - -PEAK_FLOPS_DICT = { - "float32":{ - "NVIDIA-A100-PCIe-80GB": 312e12, - "NVIDIA-A100-SXM-80GB": 312e12, - "NVIDIA-H100-PCIe-80GB": 756e12, - "NVIDIA-RTX-A5000-24GB": 222.2e12 - }, - "float16":{ - "NVIDIA-A100-PCIe-80GB": 624e12, - "NVIDIA-A100-SXM-80GB": 624e12, - "NVIDIA-H100-PCIe-80GB": 1513e12, - "NVIDIA-RTX-A5000-24GB": 444.4e12 - }, - "bfloat16":{ - "NVIDIA-A100-PCIe-80GB": 624e12, - "NVIDIA-A100-SXM-80GB": 624e12, - "NVIDIA-H100-PCIe-80GB": 1513e12, - "NVIDIA-RTX-A5000-24GB": 444.4e12 - }, - "8bit":{ - "NVIDIA-A100-PCIe-80GB": 1248e12, - "NVIDIA-A100-SXM-80GB": 1248e12, - "NVIDIA-H100-PCIe-80GB": 3026e12, - "NVIDIA-RTX-A5000-24GB": 889e12 - }, - "4bit": { - "NVIDIA-A100-PCIe-80GB": 2496e12, - "NVIDIA-A100-SXM-80GB": 2496e12, - "NVIDIA-H100-PCIe-80GB": 6052e12, - "NVIDIA-RTX-A5000-24GB": 1778e12 - } - -} - -def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers): - for i in range(10): - try: - snapshot_download( - repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers - ) - return - except Exception as e: - print(f"Failed to download {repo_id} at {revision} with error: {e}. 
Retrying...") - import time - - time.sleep(60) - return - - -def get_dataset_url(row): - dataset_name = row["Benchmark"] - dataset_url = row["Dataset Link"] - benchmark = f'{dataset_name}' - return benchmark - - -def get_dataset_summary_table(file_path): - df = pd.read_csv(file_path) - - df["Benchmark"] = df.apply(lambda x: get_dataset_url(x), axis=1) - - df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]] - - return df - -def parse_nvidia_smi(): - visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) - if visible_devices is not None: - gpu_indices = visible_devices.split(',') - else: - # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set - result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True) - if result.returncode != 0: - print("Failed to query GPU indices.") - return [] - gpu_indices = result.stdout.strip().split('\n') - # print(f"gpu_indices: {gpu_indices}") - gpu_stats = [] - - gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%') - # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)') - gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)') - - gpu_name = "" - for index in gpu_indices: - result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True) - output = result.stdout.strip() - lines = output.split("\n") - for line in lines: - match = gpu_info_pattern.search(line) - name_match = gpu_name_pattern.search(line) - gpu_info = {} - if name_match: - gpu_name = ''.join(filter(None, name_match.groups())).strip() - if match: - temp, power_usage, mem_usage, gpu_util = map(int, match.groups()) - gpu_info.update({ - GPU_TEMP: temp, - GPU_Power: power_usage, - GPU_Mem: round(mem_usage / 1024, 2), - GPU_Util: gpu_util - }) - - if len(gpu_info) >= 4: - gpu_stats.append(gpu_info) - # print(f"gpu_stats: {gpu_stats}") - gpu_name = f"{len(gpu_stats)}x{gpu_name}" - gpu_stats_total = { - GPU_TEMP: 0, - GPU_Power: 0, - GPU_Mem: 0, - GPU_Util: 0, - GPU_Name: gpu_name - } - for gpu_stat in gpu_stats: - gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP] - gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power] - gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem] - gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util] - gpu_stats_total[GPU_Mem] = gpu_stats_total[GPU_Mem] # G - gpu_stats_total[GPU_TEMP] /= len(gpu_stats) - gpu_stats_total[GPU_Power] /= len(gpu_stats) - gpu_stats_total[GPU_Util] /= len(gpu_stats) - return [gpu_stats_total] - -def monitor_gpus(stop_event, interval, stats_list): - while not stop_event.is_set(): - gpu_stats = parse_nvidia_smi() - if gpu_stats: - stats_list.extend(gpu_stats) - stop_event.wait(interval) - -def analyze_gpu_stats(stats_list): - # Check if the stats_list is empty, and return None if it is - if not stats_list: - return None - - # Initialize dictionaries to store the stats - avg_stats = {} - max_stats = {} - - # Calculate average stats, excluding 'GPU_Mem' - for key in stats_list[0].keys(): - if key != GPU_Mem and key != GPU_Name: - total = sum(d[key] for d in stats_list) - avg_stats[key] = total / len(stats_list) - - # Calculate max stats for 'GPU_Mem' - max_stats[GPU_Mem] = max(d[GPU_Mem] for d in stats_list) - if GPU_Name in stats_list[0]: - avg_stats[GPU_Name] = stats_list[0][GPU_Name] - # Update average stats with max GPU memory usage - avg_stats.update(max_stats) - - return avg_stats - -def get_gpu_number(): - visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) - if visible_devices is 
not None: - gpu_indices = visible_devices.split(',') - else: - # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set - result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True) - if result.returncode != 0: - print("Failed to query GPU indices.") - return [] - gpu_indices = result.stdout.strip().split('\n') - # print(f"gpu_indices: {gpu_indices}") - gpu_stats = [] - - gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%') - - for index in gpu_indices: - result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True) - output = result.stdout.strip() - lines = output.split("\n") - for line in lines: - match = gpu_info_pattern.search(line) - gpu_info = {} - if match: - temp, power_usage, mem_usage, gpu_util = map(int, match.groups()) - gpu_info.update({ - GPU_TEMP: temp, - GPU_Power: power_usage, - GPU_Mem: round(mem_usage / 1024, 2), - GPU_Util: gpu_util - }) - - if len(gpu_info) >= 4: - gpu_stats.append(gpu_info) - - return len(gpu_stats) - -def get_gpu_details(): - gpus = GPUtil.getGPUs() - gpu = gpus[0] - name = gpu.name.replace(" ", "-") - memory_gb = round(gpu.memoryTotal / 1024) - memory = f"{memory_gb}GB" - - for part in name.split('-'): - if part.endswith("GB") and part[:-2].isdigit(): - name = name.replace(f"-{part}", "").replace(part, "") - - formatted_name = f"{name}-{memory}" - - return formatted_name - -def get_peak_bw(gpu_name): - return MEM_BW_DICT[gpu_name] - -def get_peak_flops(gpu_name, precision): - return PEAK_FLOPS_DICT[precision][gpu_name] - -def transfer_precision2bytes(precision): - if precision == "float32": - return 4 - elif precision in ["float16", "bfloat16"]: - return 2 - elif precision == "8bit": - return 1 - elif precision == "4bit": - return 0.5 - else: - raise ValueError(f"Unsupported precision: {precision}") - -if __name__ == "__main__": - print(analyze_gpu_stats(parse_nvidia_smi())) diff --git a/src/backend/envs.py b/src/backend/envs.py index 212f4318e47b069ab02a93bbc8f957899632c1e7..258c2c901e87c3e1987b625669d156947ad81bfb 100644 --- a/src/backend/envs.py +++ b/src/backend/envs.py @@ -58,6 +58,7 @@ class Tasks(Enum): # task20 = Task("race", "acc", "RACE", 0) task21 = Task("mmlu", "acc", "MMLU", 5) task22 = Task("gsm8k_custom", "em", "GSM8K", 5) + task23 = Task("gsm8k_cot", "em", "GSM8K", 8) EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk") diff --git a/src/backend/hflm_with_measurement.py b/src/backend/hflm_with_measurement.py index 5b2133fd0ed841a776c34dc3dbab8b6776524f7d..6833210668d60c01a019edbb44bb8ca4508fb03f 100644 --- a/src/backend/hflm_with_measurement.py +++ b/src/backend/hflm_with_measurement.py @@ -37,6 +37,9 @@ from lm_eval.models.utils import ( stop_sequences_criteria, ) from lm_eval.models.huggingface import HFLM +from src.utils import get_gpu_number, get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops +from src.submission.check_validity import get_model_size +from src.envs import API class StopWatch(TextStreamer): @@ -67,6 +70,9 @@ class StopWatch(TextStreamer): class HFLMWithMeasurement(HFLM): def __init__(self, **kwargs): super().__init__(**kwargs) + self.pretrained = kwargs.get("pretrained", None) + self.revision = kwargs.get("revision", None) + self.precision = kwargs.get("dtype", None) def _loglikelihood_tokens( self, @@ -279,7 +285,7 @@ class HFLMWithMeasurement(HFLM): # Answer: (log prob, is-exact-match) answer = (float(logits.sum()), bool(max_equal)) - 
res.append((answer, per_sample_time, 0, 0)) + res.append((answer, per_sample_time, 0, 0, 0, 0)) self.cache_hook.add_partial("loglikelihood", request_str, answer) pbar.update(1) @@ -288,7 +294,7 @@ class HFLMWithMeasurement(HFLM): return re_ord.get_original(res) - def _model_generate(self, context, max_length, stop, **generation_kwargs): + def _model_generate(self, context, max_tokens, stop, **generation_kwargs): # temperature = 0.0 if not set # if do_sample is false and temp==0.0: # remove temperature, as do_sample=False takes care of this @@ -296,7 +302,7 @@ class HFLMWithMeasurement(HFLM): generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0) do_sample = generation_kwargs.get("do_sample", None) - is_gsm8k = generation_kwargs.get("is_gsm8k", False) + # is_gsm8k = generation_kwargs.get("is_gsm8k", False) # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies if generation_kwargs.get("temperature") == 0.0 and do_sample is None: @@ -305,48 +311,133 @@ class HFLMWithMeasurement(HFLM): if do_sample is False and generation_kwargs.get("temperature") == 0.0: generation_kwargs.pop("temperature") - generation_kwargs.pop("is_gsm8k") + # if is_gsm8k: + # generation_kwargs.pop("is_gsm8k") + + context_length = context.shape[1] - if not is_gsm8k: - # build stopping criteria - stopping_criteria = stop_sequences_criteria( - self.tokenizer, stop, context.shape[1], context.shape[0] - ) - stop_watch = StopWatch(self.tokenizer) - start = time() - res = self.model.generate( - input_ids=context, - max_length=max_length, - stopping_criteria=stopping_criteria, - pad_token_id=self.tokenizer.pad_token_id, - use_cache=True, - streamer=stop_watch, - **generation_kwargs, - ) - end = time() + if self.model.__class__.__name__ == "MoE": + model_config = self.model.model.config else: - # print("Using GSM8K") - stop_watch = StopWatch(self.tokenizer) - start = time() - res = self.model.generate( - input_ids=context, - max_length=max_length, - eos_token_id=stop, - pad_token_id=self.tokenizer.pad_token_id, - use_cache=True, - streamer=stop_watch, - **generation_kwargs, - ) - end = time() + model_config = self.model.config + + if not self.precision: + if model_config.quantization_config._load_in_4bit: + self.precision = "4bit" + elif model_config.quantization_config._load_in_8bit: + self.precision = "8bit" + else: + raise ValueError("Unknown precision") + + # print(self.model) + linear_count = 0 + element_wise_mul = 0 + for name, module in self.model.named_modules(): + if ('layers.0.' in name or 'decoder.0.' in name) and ('attn' not in name): + if 'experts.0.' 
in name: + if isinstance(module, torch.nn.Linear): + # print(name, module) + linear_count += 1 + elif 'experts' not in name: + if "gate" not in name or "gate_proj" in name: + if "gate_proj" in name: + element_wise_mul = 1 + if isinstance(module, torch.nn.Linear): + # print(name, module) + linear_count += 1 + else: + continue + print(f"linear_count: {linear_count}") + + stopping_criteria = stop_sequences_criteria( + self.tokenizer, stop, context.shape[1], context.shape[0] + ) + stop_watch = StopWatch(self.tokenizer) + start = time() + res = self.model.generate( + input_ids=context, + max_new_tokens=max_tokens, + stopping_criteria=stopping_criteria, + pad_token_id=self.tokenizer.pad_token_id, + use_cache=True, + streamer=stop_watch, + **generation_kwargs, + ) + end = time() batch_size = context.shape[0] output_length = stop_watch.decoding_iterations + + precision_bytes = transfer_precision2bytes(self.precision) + + model_info = API.model_info(repo_id=self.pretrained, revision=self.revision) + model_size_param = get_model_size(model_info=model_info, precision=self.precision) + + n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers + d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model + + if hasattr(model_config, "num_experts_per_tok"): + n_experts_per_tok = model_config.num_experts_per_tok + elif hasattr(model_config, "num_selected_experts"): + n_experts_per_tok = model_config.num_selected_experts + else: + n_experts_per_tok = 1 + + if hasattr(model_config, "ffn_dim"): + d_ff = model_config.ffn_dim + elif hasattr(model_config, "intermediate_size"): + d_ff = model_config.intermediate_size + elif hasattr(model_config, "d_ff"): + d_ff = model_config.d_ff + else: + if hasattr(model_config, "ff_ratio"): + d_ff = d_model * model_config.ff_ratio + else: + raise ValueError("Unknown FFN dimension") + + if hasattr(model_config, "num_local_experts"): + num_experts = model_config.num_local_experts + elif hasattr(model_config, "num_experts"): + num_experts = model_config.num_experts + else: + num_experts = 1 + + ffn_params = n_layers * d_ff * linear_count * d_model + + shared_params = model_size_param * 1e9 - num_experts * ffn_params + + model_size = shared_params + n_experts_per_tok * ffn_params + + per_token_kv_size = 2 * n_layers * d_model * precision_bytes + + peak_bw_single = get_peak_bw(get_gpu_details()) + peak_bw = peak_bw_single * get_gpu_number() + + context_prefill_size = context_length + kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2 + + kv_size = kv_size / 1e9 + + n_vocab = model_config.vocab_size end_to_end_time = (end - start) / batch_size prefilling_time = stop_watch.prefilling_time / batch_size decoding_time = stop_watch.decoding_time / batch_size token_per_sec = output_length / decoding_time - return res, end_to_end_time, prefilling_time, token_per_sec + achieve_mem_bw = (model_size * precision_bytes / 1e9 + kv_size) * token_per_sec + + avg_context_length = context_length + (output_length - 1) / 2 + flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab + peak_flops_single = get_peak_flops(get_gpu_details(), self.precision) + peak_flops = peak_flops_single * get_gpu_number() + + ## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial + mfu = token_per_sec * flops_per_token / peak_flops + 
+        mbu = achieve_mem_bw / peak_bw
+
+        print(f"mfu: {mfu}, mbu: {mbu}")
+
+        return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
 
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
@@ -423,15 +514,18 @@ class HFLMWithMeasurement(HFLM):
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             # add EOS token to stop sequences
-            eos = self.tok_decode(self.eot_token_id)
+            eos = "<|eot_id|>"
             if not until:
                 until = [eos]
             else:
                 until.append(eos)
 
-            is_gsm8k = kwargs.get("is_gsm8k", False)
-            if is_gsm8k:
-                until = [self.tokenizer.eos_token_id, self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+            # is_gsm8k = kwargs.get("is_gsm8k", False)
+            # if is_gsm8k:
+            #     until = ["Question:", "Question", ""]
+            #     eos_ids = [self.tokenizer.eos_token_id,
+            #                self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
@@ -457,11 +551,11 @@ class HFLMWithMeasurement(HFLM):
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
 
-            if "max_length" not in kwargs:
-                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+            if "max_tokens" not in kwargs:
+                kwargs["max_tokens"] = max_gen_toks
 
             # perform batched generation
-            cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
+            cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
@@ -477,15 +571,16 @@ class HFLMWithMeasurement(HFLM):
 
                 s = self.tok_decode(cont_toks)
 
-                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                if not is_gsm8k:
-                    for term in until:
-                        if len(term) > 0:
-                            # ignore '' separator,
-                            # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
-                            s = s.split(term)[0]
-
-                res.append((s, end_to_end_time, prefilling_time, token_per_sec))
+                # # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                # if not is_gsm8k:
+                for term in until:
+                    if len(term) > 0:
+                        # ignore '' separator,
+                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                        s = s.split(term)[0]
+
+                # print(s)
+                res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
 
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)
diff --git a/src/backend/moe_infinity.py b/src/backend/moe_infinity.py
index 76851df2501b1b17006d29987752fe9bd8dcb381..a3c676549b8cbd1d374d282bf56cfcca68548a76 100644
--- a/src/backend/moe_infinity.py
+++ b/src/backend/moe_infinity.py
@@ -31,8 +31,9 @@ class MoEHFLM(HFLMWithMeasurement):
         self.use_chat_template = use_chat_template
         if "device" in kwargs:
             kwargs.pop("device")
+        kwargs["device_map"] = "cuda:0"
         super().__init__(
-            *args, **kwargs, pretrained=pretrained, device_map="cuda:0"
+            *args, **kwargs, pretrained=pretrained
         )  # Assuming HFLM accepts a 'pretrained' arg and handles it
         # self._create_model()
         shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
diff --git a/src/backend/run_eval_suite.py b/src/backend/run_eval_suite.py
index b175bbcc01bb81f00beb67c31d16b142ffe1d26c..390c6292eac93532fa5f3115e73fd223df59fc73 100644
--- a/src/backend/run_eval_suite.py
+++ b/src/backend/run_eval_suite.py
@@ -17,12 +17,16 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu * 100
+        result_dict["mbu"] = mbu * 100
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
@@ -33,6 +37,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
@@ -43,6 +49,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
diff --git a/src/backend/tasks/gsm8k/gsm8k-custom.yaml b/src/backend/tasks/gsm8k/gsm8k-custom.yaml
index 25f32ec81b9d7446cba994e7e980de7e462a3e46..50c537b9cbd7dab62319dd2995f6334320c0f32e 100644
--- a/src/backend/tasks/gsm8k/gsm8k-custom.yaml
+++ b/src/backend/tasks/gsm8k/gsm8k-custom.yaml
@@ -22,18 +22,21 @@ metric_list:
     - "\\.$"
 generation_kwargs:
   until:
-    - "<|eot_id|>"
+    - "Question:"
+    - "Question"
+    - ""
+    - "<|im_end|>"
   do_sample: false
   temperature: 0.0
-  is_gsm8k: true
+  # is_gsm8k: true
 repeats: 1
 num_fewshot: 5
 filter_list:
-  # - name: "strict-match"
-  #   filter:
-  #     - function: "regex"
-  #       regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
-  #     - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
   - name: "flexible-extract"
     filter:
       - function: "regex"
diff --git a/src/backend/tasks/measurement_task_utils.py b/src/backend/tasks/measurement_task_utils.py
index 18b81a03e47cf51acd16a2ca0532dbed5558192c..9cf96db5f3291ec148dc8c8ebfa5a1a51316b416 100644
--- a/src/backend/tasks/measurement_task_utils.py
+++ b/src/backend/tasks/measurement_task_utils.py
@@ -12,6 +12,9 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
+
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
@@ -19,6 +22,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 
diff --git a/src/display/utils.py b/src/display/utils.py
index 98188b5b94fbae0ba6f856711cdbb42e3b2e821c..2dc5c094370da143b544a76c71079b690ed86ebf 100644
--- a/src/display/utils.py
+++ b/src/display/utils.py
@@ -18,12 +18,16 @@ GPU_Power = 'Power(W)'
 GPU_Mem = 'Mem(G)'
 GPU_Name = "GPU"
 GPU_Util = 'Util(%)'
+MFU = 'MFU(%)'
+MBU = 'MBU(%)'
 BATCH_SIZE = 'bs'
 PRECISION = "Precision"
 system_metrics_to_name_map = {
     "end_to_end_time": f"{E2Es}",
     "prefilling_time": f"{PREs}",
     "decoding_throughput": f"{TS}",
+    "mfu": f"{MFU}",
+    "mbu": f"{MBU}"
 }
 
 gpu_metrics_to_name_map = {
@@ -34,6 +38,8 @@ gpu_metrics_to_name_map = {
     "batch_size": BATCH_SIZE,
     "precision": PRECISION,
     GPU_Name: GPU_Name,
+    MFU: MFU,
+    MBU: MBU
 }
 
 @dataclass
@@ -75,7 +81,8 @@ class Tasks(Enum):
     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
-    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (8-shot)
+    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
+    gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
 
 
 # These classes are for user facing column names,
@@ -115,6 +122,8 @@ for task in Tasks:
         continue
     # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_mbu", ColumnContent, ColumnContent(f"{task.value.col_name} {MBU}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_mfu", ColumnContent, ColumnContent(f"{task.value.col_name} {MFU}", "number", True, hidden=True)])
 
 
 # Model information
diff --git a/src/submission/check_validity.py b/src/submission/check_validity.py
index 9c64c8e470460e5e00ba28219d8ff3b0de4ffdf0..3d2394d8a70621a2f8cf6e6b283e96aa8549cb0c 100644
--- a/src/submission/check_validity.py
+++ b/src/submission/check_validity.py
@@ -74,7 +74,7 @@ def is_model_on_hub(
 
 
 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
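
Note (not part of the patch above): MFU and MBU here denote model FLOPs utilization and model bandwidth utilization, i.e. the fraction of the GPU's peak compute and peak memory bandwidth that decoding actually achieves. The patch computes them inside the measured generation path (e.g. mbu = achieve_mem_bw / peak_bw) and surfaces them as leaderboard columns. A minimal illustrative sketch of the same ratios, using a hypothetical helper and hardware numbers rather than code from the patch, is:

# Illustrative sketch only -- hypothetical helper, not code from this repository.
def estimate_mfu_mbu(tokens_per_sec, active_params, bytes_per_param, peak_flops, peak_mem_bw):
    # Each decoded token does roughly 2 FLOPs per active weight (multiply + add)
    # and streams every active weight from GPU memory once.
    achieved_flops = 2 * active_params * tokens_per_sec
    achieved_mem_bw = active_params * bytes_per_param * tokens_per_sec
    return achieved_flops / peak_flops, achieved_mem_bw / peak_mem_bw

# Example: ~7e9 active params in bf16 (2 bytes each) at 50 tok/s on a GPU with
# 312 TFLOP/s peak compute and 2 TB/s peak memory bandwidth.
mfu, mbu = estimate_mfu_mbu(50, 7e9, 2, 312e12, 2e12)
print(f"MFU: {mfu:.2%}, MBU: {mbu:.2%}")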