chivier committed
Commit fe8e6f7
1 parent: 0122892

sync from github

requirements.txt CHANGED
@@ -16,7 +16,7 @@ requests
 semantic-version
 tqdm
 wandb
-transformers>=4.36.0
+transformers
 tokenizers>=0.15.0
 lm_eval[ifeval] @ git+https://github.com/EleutherAI/[email protected]
 accelerate
@@ -31,4 +31,6 @@ spacy==3.7.4
 selfcheckgpt
 immutabledict
 gputil
-bitsandbytes
+bitsandbytes
+openai
+scikit-learn
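Note: the two new dependencies are consumed by the Arena Hard task added below; a quick sketch of where they are imported (see arena_utils.py and arena_judgment.py later in this commit):

    import openai                                          # judge-model API calls (arena_utils.chat_completion_openai)
    from sklearn.linear_model import LogisticRegression    # Bradley-Terry Elo fit (arena_judgment.compute_mle_elo)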
src/backend/envs.py CHANGED
@@ -59,6 +59,7 @@ class Tasks(Enum):
     task21 = Task("mmlu", "acc", "MMLU", 5)
     task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
     # task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
+    task24 = Task("arena_hard", "score", "Arena Hard", 0)


 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
src/backend/hflm_with_measurement.py CHANGED
@@ -354,6 +354,7 @@ class HFLMWithMeasurement(HFLM):
     linear_count += 1
 elif isinstance(module, DbrxExpertGLU):
     linear_count = 3
+    element_wise_mul = 1
 # elif 'experts' not in name:
 #     if ("gate" not in name and "router" not in name) or "gate_proj" in name:
 #         if "gate_proj" in name:
@@ -388,8 +389,7 @@

 precision_bytes = transfer_precision2bytes(self.precision)

-model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
-model_size_param = get_model_size(model_info=model_info, precision=self.precision)
+model_size_param = sum(p.numel() for p in self.model.parameters())

 n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
     (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
@@ -429,7 +429,7 @@

 ffn_params = n_layers * d_ff * linear_count * d_model

-shared_params = model_size_param * 1e9 - num_experts * ffn_params
+shared_params = model_size_param - num_experts * ffn_params

 model_size = shared_params + n_experts_per_tok * ffn_params
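Note: with the Hub lookup (API.model_info / get_model_size) removed, the parameter count now comes from the weights that are actually loaded, as a raw count rather than billions (hence the dropped * 1e9). A minimal sketch of the resulting active-parameter estimate, reusing the variable names from the hunks above (the surrounding code is assumed, not shown):

    total_params = sum(p.numel() for p in model.parameters())       # counted locally, no HF Hub call
    ffn_params = n_layers * d_ff * linear_count * d_model            # per-expert FFN parameters
    shared_params = total_params - num_experts * ffn_params          # attention, embeddings, norms, ...
    model_size = shared_params + n_experts_per_tok * ffn_params      # parameters active per token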
 
src/backend/run_eval_suite.py CHANGED
@@ -25,8 +25,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
-        result_dict["mfu"] = mfu * 100
-        result_dict["mbu"] = mbu * 100
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
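Note: MFU and MBU are now stored as raw fractions instead of pre-scaled percentages. Any percentage formatting is assumed to happen at display time, e.g.:

    mfu_display = f"{result_dict['mfu'] * 100:.2f}%"   # illustrative display-side formatting
    mbu_display = f"{result_dict['mbu'] * 100:.2f}%"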
src/backend/tasks/arena_hard/__init__.py ADDED
File without changes
src/backend/tasks/arena_hard/arena_hard.yaml ADDED
@@ -0,0 +1,2 @@
+task: arena_hard
+class: !function task.ArenaHard
src/backend/tasks/arena_hard/arena_judgment.py ADDED
@@ -0,0 +1,256 @@
+'''
+This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
+under the Apache 2.0 License from the arena-hard project.
+(https://github.com/lm-sys/arena-hard)
+Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
+See the NOTICE file distributed with this work for additional
+information regarding copyright ownership.
+'''
+
+import pandas as pd
+from tqdm import tqdm
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import math
+from collections import defaultdict
+from tqdm import tqdm
+
+from src.backend.tasks.arena_hard.arena_utils import (
+    chat_completion_openai,
+    load_questions,
+    load_model_answers,
+    get_endpoint,
+    make_config,
+)
+
+
+def get_score(judgment, pattern, pairwise=True):
+    matches = pattern.findall(judgment)
+    matches = [m for m in matches if m != ""]
+    if len(set(matches)) == 0:
+        return None, True
+    elif len(set(matches)) == 1:
+        if pairwise:
+            return matches[0].strip("\n"), False
+        return int(matches[0])
+    else:
+        return None, False
+
+
+# get answer from model
+def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
+    api_dict = get_endpoint(endpoint_dict["endpoints"])
+
+    # if endpoint_dict["api_type"] == "anthropic":
+    #     output = chat_completion_anthropic(model, conv, temperature, max_tokens)
+    # elif endpoint_dict["api_type"] == "azure":
+    #     output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)
+
+    output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
+    return output
+
+
+def judgment(**args):
+    question = args["question"]
+    answer = args["answer"]
+    reference = args["reference"]
+    baseline = args["baseline_answer"]
+    configs = args["configs"]
+    # output_file = args["output_file"]
+    model = configs["judge_model"]
+
+    num_games = 2 if configs["pairwise"] else 1
+
+    # output = {
+    #     "question_id": question["question_id"],
+    #     "judge": model,
+    #     "model": "custom_model",
+    #     "games": []
+    # }
+    output = [question["question_id"]]
+
+    for game in range(num_games):
+        conv = [{"role": "system", "content": configs["system_prompt"]}]
+
+        for template in configs["prompt_template"]:
+            prompt_args = {}
+
+            prompt_args[f"question_{1}"] = question["content"]
+            base = 1
+
+            if baseline:
+                if game % 2 == 1:  # swap position
+                    temp = baseline
+                    baseline = answer
+                    answer = temp
+
+                if game == 0:
+                    for i, turn in enumerate(baseline["choices"][0]["turns"]):
+                        prompt_args[f"answer_{i+1}"] = turn["content"]
+                        base += 1
+
+                if game == 1:
+                    prompt_args[f"answer_{1}"] = baseline
+                    base += 1
+
+            if answer:
+                prompt_args[f"answer_{base}"] = answer
+
+            if reference:
+                for j, ref_answer in enumerate(reference):
+                    for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
+                        prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]
+
+            user_prompt = template.format(**prompt_args)
+            conv.append({"role": "user", "content": user_prompt})
+
+        judgment = ""
+        for _ in range(2):
+            new_judgment = get_answer(
+                model,
+                conv,
+                configs["temperature"],
+                configs["max_tokens"],
+                args["endpoint_dict"],
+            )
+
+            judgment += ("\n" + new_judgment)
+
+            score, try_again = get_score(judgment, args["regex_pattern"])
+
+            conv.append({"role": "assistant", "content": new_judgment})
+
+            if not try_again:
+                break
+
+            conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
+        print("Finish judgment!!!")
+        # result = {
+        #     "user_prompt": conv[1]["content"],
+        #     "judgment": judgment,
+        #     "score": score
+        # }
+        output.append(score)
+
+    return output
+
+def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
+    arena_hard_battles = pd.DataFrame()
+
+    print("Turning score list into battles...")
+
+    for scores in tqdm(score_list):
+        question_id, score1, score2 = scores
+
+        # Process game 1
+        output = {"question_id": question_id,
+                  "model_a": "gpt-4-0314",
+                  "model_b": f"custom_model"}  # Unique identifier for model
+        weight = 1
+        if score1 == "A=B":
+            output["winner"] = "tie"
+        elif score1 == "A>B":
+            output["winner"] = "model_a"
+        elif score1 == "A>>B":
+            output["winner"] = "model_a"
+            weight = WEIGHT
+        elif score1 == "B>A":
+            output["winner"] = "model_b"
+        elif score1 == "B>>A":
+            output["winner"] = "model_b"
+            weight = WEIGHT
+        else:
+            weight = 0
+
+        if weight:
+            arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
+
+        if not first_game_only:
+            # Process game 2
+            output = {"question_id": question_id,
+                      "model_a": "gpt-4-0314",
+                      "model_b": f"custom_model"}  # Unique identifier for model
+            weight = 1
+            if score2 == "A=B":
+                output["winner"] = "tie"
+            elif score2 == "A>B":
+                output["winner"] = "model_b"
+            elif score2 == "A>>B":
+                output["winner"] = "model_b"
+                weight = WEIGHT
+            elif score2 == "B>A":
+                output["winner"] = "model_a"
+            elif score2 == "B>>A":
+                output["winner"] = "model_a"
+                weight = WEIGHT
+            else:
+                weight = 0
+
+            if weight:
+                arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
+
+    arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
+    return arena_hard_battles
+
+def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
+    models = pd.concat([df["model_a"], df["model_b"]]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    LOW_RATING = 100
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+
+    X = np.zeros([n, p])
+    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
+    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
+
+    # one A win => two A win
+    Y = np.zeros(n)
+    Y[df["winner"] == "model_a"] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
+    tie_idx[len(tie_idx)//2:] = False
+    Y[tie_idx] = 1.0
+
+    if len(np.unique(Y)) == 1:
+        # If there's only one class in the data, assign default ratings
+        elo_scores = np.full(p, LOW_RATING)
+        elo_scores[models["gpt-4-0314"]] = INIT_RATING
+    else:
+        lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
+        lr.fit(X,Y)
+
+        elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+    # set anchor as gpt-4-0314 = 1000
+    if "gpt-4-0314" in models.index:
+        elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
+    return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
+
+def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
+    names = sorted(list(elo_ratings.keys()))
+    wins = defaultdict(lambda: defaultdict(lambda: 0))
+    for a in names:
+        for b in names:
+            ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
+            wins[a][b] = ea
+            wins[b][a] = 1 - ea
+
+    data = {
+        a: [wins[a][b] if a != b else np.NAN for b in names]
+        for a in names
+    }
+
+    df = pd.DataFrame(data, index=names)
+    df.index.name = "model_a"
+    df.columns.name = "model_b"
+    return df.T
+
+def get_win_rate_column(df, column, baseline="gpt-4-0314"):
+    to_dict = df[["model", column]].set_index("model").to_dict()[column]
+    win_rate_table = predict_win_rate(to_dict)
+    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
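A minimal usage sketch of the scoring pipeline above, with toy verdicts rather than real judgments; each score entry is [question_id, game-1 verdict, game-2 verdict], the shape produced by judgment():

    import pandas as pd

    score_list = [
        ["q1", "A>B", "B>A"],
        ["q2", "A=B", "A=B"],
        ["q3", "B>>A", "A>>B"],
    ]
    battles = get_battles_from_scores(score_list)      # verdicts -> weighted pairwise battles (also written to ./arena_hard_battles.jsonl)
    elo = compute_mle_elo(battles)                      # Bradley-Terry fit via LogisticRegression
    stats = pd.DataFrame({"model": elo.index, "score": elo.values})
    win_rate = get_win_rate_column(stats, "score")      # win rate vs. the gpt-4-0314 baseline, in percent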
src/backend/tasks/arena_hard/arena_utils.py ADDED
@@ -0,0 +1,349 @@
+'''
+This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
+under the Apache 2.0 License from the arena-hard project.
+(https://github.com/lm-sys/arena-hard)
+Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
+See the NOTICE file distributed with this work for additional
+information regarding copyright ownership.
+'''
+
+
+import os
+import json
+import time
+import yaml
+import random
+
+from typing import Optional
+from glob import glob
+
+# API setting constants
+API_MAX_RETRY = 16
+API_RETRY_SLEEP = 10
+API_ERROR_OUTPUT = "$ERROR$"
+
+
+OPENAI_MODEL_LIST = (
+    "gpt-3.5-turbo",
+    "gpt-3.5-turbo-0301",
+    "gpt-3.5-turbo-0613",
+    "gpt-3.5-turbo-0613-verbose",
+    "gpt-3.5-turbo-1106",
+    "gpt-3.5-turbo-0125",
+    "gpt-4",
+    "gpt-4-0314",
+    "gpt-4-0613",
+    "gpt-4-turbo",
+    "gpt-4-1106-preview",
+    "gpt-4-0125-preview",
+)
+
+
+temperature_config = {
+    "writing": 0.7,
+    "roleplay": 0.7,
+    "extraction": 0.0,
+    "math": 0.0,
+    "coding": 0.0,
+    "reasoning": 0.0,
+    "stem": 0.1,
+    "humanities": 0.1,
+}
+
+
+def load_questions(question_file: str):
+    """Load questions from a file."""
+    questions = []
+    with open(question_file, "r") as ques_file:
+        for line in ques_file:
+            if line:
+                questions.append(json.loads(line))
+    return questions
+
+
+def load_model_answers(answer_dir: str):
+    """Load model answers.
+
+    The return value is a python dict of type:
+    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
+    """
+    filenames = glob(os.path.join(answer_dir, "*.jsonl"))
+    filenames.sort()
+    model_answers = {}
+
+    for filename in filenames:
+        model_name = os.path.basename(filename)[:-6]
+        answer = {}
+        with open(filename) as fin:
+            for line in fin:
+                line = json.loads(line)
+                answer[line["question_id"]] = line
+        model_answers[model_name] = answer
+
+    return model_answers
+
+
+def get_endpoint(endpoint_list):
+    if endpoint_list is None:
+        return None
+    assert endpoint_list is not None
+    # randomly pick one
+    api_dict = random.choices(
+        endpoint_list
+    )[0]
+    return api_dict
+
+
+# load config args from config yaml files
+def make_config(config_file: str) -> dict:
+    config_kwargs = {}
+    with open(config_file, "r") as f:
+        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
+
+    return config_kwargs
+
+
+def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
+    import openai
+    if api_dict:
+        client = openai.OpenAI(
+            base_url=api_dict["api_base"],
+            api_key=api_dict["api_key"],
+        )
+    else:
+        client = openai.OpenAI()
+
+    output = API_ERROR_OUTPUT
+    for _ in range(API_MAX_RETRY):
+        try:
+            # print(messages)
+            completion = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+                max_tokens=max_tokens
+            )
+            output = completion.choices[0].message.content
+            break
+        except openai.RateLimitError as e:
+            print(type(e), e)
+            time.sleep(API_RETRY_SLEEP)
+        except openai.BadRequestError as e:
+            print(messages)
+            print(type(e), e)
+        except KeyError:
+            print(type(e), e)
+            break
+
+    return output
+
+
+# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
+#     import openai
+#     from openai import AzureOpenAI
+
+#     api_base = api_dict["api_base"]
+#     client = AzureOpenAI(
+#         azure_endpoint = api_base,
+#         api_key= api_dict["api_key"],
+#         api_version=api_dict["api_version"],
+#         timeout=240,
+#         max_retries=2
+#     )
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             response = client.chat.completions.create(
+#                 model=model,
+#                 messages=messages,
+#                 n=1,
+#                 temperature=temperature,
+#                 max_tokens=max_tokens,
+#                 seed=42,
+#             )
+#             output = response.choices[0].message.content
+#             break
+#         except openai.RateLimitError as e:
+#             print(type(e), e)
+#             time.sleep(API_RETRY_SLEEP)
+#         except openai.BadRequestError as e:
+#             print(type(e), e)
+#             break
+#         except KeyError:
+#             print(type(e), e)
+#             break
+
+#     return output
+
+
+# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
+#     import anthropic
+
+#     if api_dict:
+#         api_key = api_dict["api_key"]
+#     else:
+#         api_key = os.environ["ANTHROPIC_API_KEY"]
+
+#     sys_msg = ""
+#     if messages[0]["role"] == "system":
+#         sys_msg = messages[0]["content"]
+#         messages = messages[1:]
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             # print(sys_msg)
+#             c = anthropic.Anthropic(api_key=api_key)
+#             response = c.messages.create(
+#                 model=model,
+#                 messages=messages,
+#                 stop_sequences=[anthropic.HUMAN_PROMPT],
+#                 max_tokens=max_tokens,
+#                 temperature=temperature,
+#                 system=sys_msg
+#             )
+#             output = response.content[0].text
+#             break
+#         except anthropic.APIError as e:
+#             print(type(e), e)
+#             time.sleep(API_RETRY_SLEEP)
+#     return output
+
+
+# def chat_completion_mistral(model, messages, temperature, max_tokens):
+#     from mistralai.client import MistralClient
+#     from mistralai.models.chat_completion import ChatMessage
+#     from mistralai.exceptions import MistralException
+
+#     api_key = os.environ["MISTRAL_API_KEY"]
+#     client = MistralClient(api_key=api_key)
+
+#     prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             chat_response = client.chat(
+#                 model=model,
+#                 messages=prompts,
+#                 temperature=temperature,
+#                 max_tokens=max_tokens,
+#             )
+#             output = chat_response.choices[0].message.content
+#             break
+#         except MistralException as e:
+#             print(type(e), e)
+#             break
+
+#     return output
+
+
+# def chat_completion_gemini(model, messages, temperature, max_tokens):
+#     import google.generativeai as genai
+#     genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+
+#     safety_settings = [
+#         {
+#             "category": "HARM_CATEGORY_HARASSMENT",
+#             "threshold": "BLOCK_NONE"
+#         },
+#         {
+#             "category": "HARM_CATEGORY_HATE_SPEECH",
+#             "threshold": "BLOCK_NONE"
+#         },
+#         {
+#             "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+#             "threshold": "BLOCK_NONE"
+#         },
+#         {
+#             "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+#             "threshold": "BLOCK_NONE"
+#         },
+#     ]
+
+#     # Set up the model
+#     generation_config = {
+#         "temperature": temperature,
+#         "top_p": 1,
+#         "top_k": 1,
+#         "max_output_tokens": max_tokens,
+#     }
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             gemini = genai.GenerativeModel(
+#                 model_name=model,
+#                 generation_config=generation_config,
+#                 safety_settings=safety_settings)
+
+#             convo = gemini.start_chat(history=[])
+
+#             convo.send_message(messages)
+#             output = convo.last.text
+#             break
+#         except genai.types.generation_types.StopCandidateException as e:
+#             print(type(e), e)
+#             break
+#         except Exception as e:
+#             print(type(e), e)
+#             time.sleep(API_RETRY_SLEEP)
+
+#     return output
+
+
+# def chat_completion_cohere(model, messages, temperature, max_tokens):
+#     import cohere
+
+#     co = cohere.Client(os.environ["COHERE_API_KEY"])
+#     assert len(messages) > 0
+
+#     template_map = {"system":"SYSTEM",
+#                     "assistant":"CHATBOT",
+#                     "user":"USER"}
+
+#     assert messages[-1]["role"] == "user"
+#     prompt = messages[-1]["content"]
+
+#     if len(messages) > 1:
+#         history = []
+#         for message in messages[:-1]:
+#             history.append({"role":template_map[message["role"]], "message":message["content"]})
+#     else:
+#         history = None
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             response = co.chat(
+#                 message=prompt,
+#                 model=model,
+#                 temperature=temperature,
+#                 max_tokens=max_tokens,
+#                 chat_history=history,
+#             )
+#             output = response.text
+#             break
+#         except cohere.core.api_error.ApiError as e:
+#             print(type(e), e)
+#             raise
+#         except Exception as e:
+#             print(type(e), e)
+#             break
+
+#     return output
+
+
+def reorg_answer_file(answer_file):
+    """Sort by question id and de-duplication"""
+    answers = {}
+    with open(answer_file, "r") as fin:
+        for l in fin:
+            qid = json.loads(l)["question_id"]
+            answers[qid] = l
+
+    qids = sorted(list(answers.keys()))
+    with open(answer_file, "w") as fout:
+        for qid in qids:
+            fout.write(answers[qid])
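A short sketch of how the judge wires these helpers together (the file path and prompt are illustrative, and an OPENAI_API_KEY is assumed to be set in the environment):

    endpoints = make_config("configs/api_config.yaml")        # e.g. {"gpt-4-1106-preview": {...}}
    info = endpoints["gpt-4-1106-preview"]
    api_dict = get_endpoint(info["endpoints"])                 # None here, so the default OpenAI client is used
    messages = [{"role": "user", "content": "Say hello."}]
    reply = chat_completion_openai("gpt-4-1106-preview", messages,
                                   temperature=0.0, max_tokens=64, api_dict=api_dict)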
src/backend/tasks/arena_hard/configs/api_config.yaml ADDED
@@ -0,0 +1,17 @@
+# gpt-3.5-turbo:
+#     model_name: gpt-3.5-turbo
+#     endpoints: null
+#     api_type: openai
+#     parallel: 8
+
+gpt-4-1106-preview:
+    model_name: gpt-4-1106-preview
+    endpoints: null
+    api_type: openai
+    parallel: 8
+
+# llama3-7b:
+#     model_name: llama3-7b
+#     endpoints: null
+#     api_type: openai
+#     parallel: 8
src/backend/tasks/arena_hard/configs/judge_config.yaml ADDED
@@ -0,0 +1,26 @@
+name: judgment config file for Arena Hard
+
+bench_name: arena-hard-v0.1
+
+# Arena Hard default
+judge_model: gpt-4-1106-preview
+# judge_model: gpt-3.5-turbo
+reference: False # Optional
+ref_model: null
+
+baseline: True
+baseline_model: gpt-4-0314
+
+pairwise: True
+temperature: 0
+max_tokens: 4096
+
+regex_pattern: \[\[([AB<>=]+)\]\]
+
+system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
+
+prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
+
+# Add your model below for evaluation
+# model_list:
+#   - gpt-3.5-turbo-0125
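The regex_pattern above is what get_score() in arena_judgment.py applies to the judge's reply; for example:

    import re
    pattern = re.compile(r"\[\[([AB<>=]+)\]\]")
    verdict = "My final verdict is that Assistant A is slightly better: [[A>B]]"
    print(pattern.findall(verdict))   # ['A>B']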
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
src/backend/tasks/arena_hard/question.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
src/backend/tasks/arena_hard/task.py ADDED
@@ -0,0 +1,220 @@
+import os
+from typing import Union, List
+
+from lm_eval.api.task import ConfigurableTask
+from lm_eval.api.instance import Instance
+
+# from lm_eval.api.registry import register_task
+from lm_eval.api.metrics import mean
+
+from src.backend.envs import DEVICE
+
+import pandas as pd
+
+from src.backend.tasks.measurement_task_utils import measure_system_metrics
+import json
+
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from datasets import Dataset
+import re
+
+from src.backend.tasks.arena_hard.arena_utils import (
+    load_questions,
+    load_questions,
+    load_model_answers,
+    make_config,
+)
+
+from src.backend.tasks.arena_hard.arena_judgment import (
+    judgment,
+    get_battles_from_scores,
+    compute_mle_elo,
+    predict_win_rate,
+    get_win_rate_column
+)
+
+def load_questions(question_file: str):
+    """Load questions from a file."""
+    questions = []
+    with open(question_file, "r") as ques_file:
+        for line in ques_file:
+            if line:
+                questions.append(json.loads(line))
+    return questions
+
+def download_wrapper(func):
+    def download(self, *args, **kwargs):
+        print("Using Arena Hard, No need to download")
+    return download
+
+original_download = ConfigurableTask.download
+ConfigurableTask.download = download_wrapper(original_download)
+# @register_task("selfcheckgpt")
+@measure_system_metrics
+class ArenaHard(ConfigurableTask):
+    VERSION = 0.0
+    OUTPUT_TYPE = "generate_until"
+    data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
+    judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
+    configs = make_config(judge_config_path)
+    model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
+    model_answers = load_model_answers(model_ans_dir)
+    data = load_questions(data_path)
+
+    def __init__(self):
+        super().__init__(config={"metadata": {"version": self.VERSION}})
+        # these end tokens are hard coded because of the current limitaion of the llm-eval.
+        # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
+        self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_length": 4096}
+        # self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
+        # self.generation_kwargs_sampling = {
+        #     "temperature": 0.99,
+        #     "do_sample": True,
+        #     "until": ["<im_end>", "<im_end>"],
+        #     "max_length": 1024,
+        # }
+
+    def transform_data(self, data):
+        transformed_data = []
+        for i in range(len(data)):
+            if self.configs["baseline"]:
+                baseline_answer = self.model_answers[self.configs["baseline_model"]][data[i]["question_id"]]
+            else:
+                baseline_answer = None
+            transformed_item = {
+                "question_id": data[i]["question_id"],
+                "content": data[i]["turns"][0]["content"], # Assuming you want the first turn's content
+                "model_answer": baseline_answer
+            }
+            transformed_data.append(transformed_item)
+        return transformed_data
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return False
+
+    def validation_docs(self):
+        self.dataset = self.transform_data(self.data)
+        self.dataset = Dataset.from_dict({"question_id": [item["question_id"] for item in self.dataset],
+                                          "content": [item["content"] for item in self.dataset],
+                                          "model_answer": [item["model_answer"] for item in self.dataset]})
+        return self.dataset
+
+    def doc_to_text(self, doc):
+        sentence = doc["content"]
+        doc_text = f"{sentence}\n"
+        return doc_text
+
+    def doc_to_target(self, doc):
+        q_id = doc["question_id"]
+        return q_id
+
+    def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
+        arguments = (ctx, self.generation_kwargs)
+        request_list = [
+            Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
+        ]
+        # sampling_arguments = (ctx, self.generation_kwargs_sampling)
+        # request_list.extend(
+        #     [
+        #         Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
+        #         for idx in range(1, self.generation_kwargs_sampling_number + 1)
+        #     ]
+        # )
+        return request_list
+
+    def process_results(self, doc, results):
+        response_temperature_0 = results[0]
+        # other_responses = results[1:]
+        api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
+        endpoint_list = make_config(api_config_path)
+
+        if self.configs["regex_pattern"]:
+            pattern = re.compile(self.configs["regex_pattern"])
+
+        ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")
+
+        ref_answers = None
+        if self.configs["reference"]:
+            ref_answers = load_model_answers(ref_answer_dir)
+            ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]
+
+        # output_files = {}
+        # models = ["custom_model"]
+        # output_dir = f"{os.path.join(os.path.dirname(__file__))}/model_judgments/{self.configs['judge_model']}"
+        # for model in models:
+        #     output_files[model] = os.path.join(
+        #         output_dir,
+        #         f"{model}.jsonl",
+        #     )
+
+        # for output_file in output_files.values():
+        #     os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+        endpoint_info = endpoint_list[self.configs["judge_model"]]
+
+        question = doc
+        kwargs = {}
+        kwargs["question"] = question
+        kwargs["answer"] = response_temperature_0
+        if ref_answers:
+            kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
+            assert len(kwargs["reference"]) == len(self.configs["ref_model"])
+        else:
+            kwargs["reference"] = None
+
+        if self.configs["baseline"]:
+            kwargs["baseline_answer"] = doc["model_answer"]
+        else:
+            kwargs["baseline_answer"] = None
+        kwargs["configs"] = self.configs
+        kwargs["endpoint_dict"] = endpoint_info
+        # kwargs["output_file"] = output_files["custom_model"]
+        kwargs["regex_pattern"] = pattern
+
+        scores = judgment(**kwargs)
+        return {"score": scores}
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        ##TODO implement the aggregation function to calculate elo for score
+        def get_win_rate(score_list):
+            battles = get_battles_from_scores(score_list)
+            bootstrap_online_elo = compute_mle_elo(battles)
+            stats = pd.DataFrame()
+            stats["results"] = None
+            stats["results"] = stats['results'].astype('object')
+            for i, model in enumerate(bootstrap_online_elo.index):
+                stats.at[i, "model"] = model
+                stats.at[i, "score"] = bootstrap_online_elo[model]
+
+            stats.sort_values(by="model", inplace=True)
+            stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()
+
+            return stats["score"][1]
+
+        return {k: get_win_rate for k in ["score"]}
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        return {k: True for k in ["score"]}
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": ["<im_end>"], "max_length": 1024}
+        self.generation_kwargs = {"until": ["<|im_end|>"], "max_length": 1024}
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {
             "temperature": 0.99,
             "do_sample": True,
-            "until": ["<im_end>", "</s>"],
+            "until": ["<|im_end|>", "</s>"],
             "max_length": 1024,
         }
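Note: the stop string now matches the literal ChatML end-of-turn token; the old "<im_end>" (without the pipes) never occurs in chat-templated output, so generation would not stop on it. Illustrative only:

    generated = "Paris is the capital of France.<|im_end|>"
    answer = generated.split("<|im_end|>")[0]   # trimming works; "<im_end>" would never match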
 
src/display/utils.py CHANGED
@@ -79,10 +79,11 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")

     # # XXX include me back at some point
-    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
+    # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
     gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
     # gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
+    arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score


 # These classes are for user facing column names,
@@ -115,9 +116,9 @@ for task in Tasks:
     auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
     # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
     # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
@@ -187,6 +188,7 @@ class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
     MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
+    VLLM = ModelDetails("vllm_moe")
     Unknown = ModelDetails("?")

     def to_str(self):
@@ -198,12 +200,13 @@
             return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
+        if inference_framework in ["vllm_moe"]:
+            return InferenceFramework.VLLM
         return InferenceFramework.Unknown

 class GPUType(Enum):
-    H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB")
+    A100_sxm = ModelDetails("NVIDIA-A100-SXM4-80GB")
     A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
-    A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB")
     Unknown = ModelDetails("?")

     def to_str(self):
@@ -211,12 +214,10 @@

     @staticmethod
     def from_str(gpu_type: str):
-        if gpu_type in ["NVIDIA-H100-PCIe-80GB"]:
-            return GPUType.A100_pcie
         if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
-            return GPUType.H100_pcie
-        if gpu_type in ["NVIDIA-A5000-24GB"]:
-            return GPUType.A5000
+            return GPUType.A100_pcie
+        if gpu_type in ["NVIDIA-A100-SXM4-80GB"]:
+            return GPUType.A100_sxm
         return GPUType.Unknown

 class WeightType(Enum):
src/leaderboard/read_evals.py CHANGED
@@ -116,7 +116,7 @@ class EvalResult:
 multiplier = 1.0
 if "time" in metric:
     multiplier = 1.0
-if "throughput" in metric or "mfu" in metric or "mbu" in metric:
+if "throughput" in metric:
     multiplier = 1.0
 if "batch_" in metric or "Mem" in metric or "Util" in metric:
     multiplier = 1
@@ -124,7 +124,10 @@

 # print('RESULTS', data['results'])
 # print('XXX', benchmark, metric, value, multiplier)
-results[benchmark][metric] = value * multiplier
+if value == "N/A":
+    results[benchmark][metric] = None
+else:
+    results[benchmark][metric] = value * multiplier

 res = EvalResult(
     eval_name=result_key,
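Note: backend result files may now contain the string "N/A" for metrics that could not be measured, and multiplying that sentinel would raise a TypeError, hence the new guard. A tiny illustration:

    value = "N/A"
    # value * 1.0  ->  TypeError: can't multiply sequence by non-int of type 'float'
    metric_value = None if value == "N/A" else value * 1.0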