chivier committed
Commit fe8e6f7
1 parent: 0122892

sync from github

requirements.txt CHANGED
@@ -16,7 +16,7 @@ requests
 semantic-version
 tqdm
 wandb
-transformers>=4.36.0
+transformers
 tokenizers>=0.15.0
 lm_eval[ifeval] @ git+https://github.com/EleutherAI/[email protected]
 accelerate
@@ -31,4 +31,6 @@ spacy==3.7.4
 selfcheckgpt
 immutabledict
 gputil
-bitsandbytes
+bitsandbytes
+openai
+scikit-learn
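Note: the two new dependencies are consumed by the Arena Hard task added below; a quick sketch of where they are imported (see arena_utils.py and arena_judgment.py later in this commit):

    import openai                                          # judge-model API calls (arena_utils.chat_completion_openai)
    from sklearn.linear_model import LogisticRegression    # Bradley-Terry Elo fit (arena_judgment.compute_mle_elo)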
src/backend/envs.py CHANGED
@@ -59,6 +59,7 @@ class Tasks(Enum):
     task21 = Task("mmlu", "acc", "MMLU", 5)
     task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
     # task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
+    task24 = Task("arena_hard", "score", "Arena Hard", 0)


 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
src/backend/hflm_with_measurement.py CHANGED
@@ -354,6 +354,7 @@ class HFLMWithMeasurement(HFLM):
     linear_count += 1
 elif isinstance(module, DbrxExpertGLU):
     linear_count = 3
+    element_wise_mul = 1
 # elif 'experts' not in name:
 #     if ("gate" not in name and "router" not in name) or "gate_proj" in name:
 #         if "gate_proj" in name:
@@ -388,8 +389,7 @@

 precision_bytes = transfer_precision2bytes(self.precision)

-model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
-model_size_param = get_model_size(model_info=model_info, precision=self.precision)
+model_size_param = sum(p.numel() for p in self.model.parameters())

 n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
     (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
@@ -429,7 +429,7 @@

 ffn_params = n_layers * d_ff * linear_count * d_model

-shared_params = model_size_param * 1e9 - num_experts * ffn_params
+shared_params = model_size_param - num_experts * ffn_params

 model_size = shared_params + n_experts_per_tok * ffn_params
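Note: with the Hub lookup (API.model_info / get_model_size) removed, the parameter count now comes from the weights that are actually loaded, as a raw count rather than billions (hence the dropped * 1e9). A minimal sketch of the resulting active-parameter estimate, reusing the variable names from the hunks above (the surrounding code is assumed, not shown):

    total_params = sum(p.numel() for p in model.parameters())       # counted locally, no HF Hub call
    ffn_params = n_layers * d_ff * linear_count * d_model            # per-expert FFN parameters
    shared_params = total_params - num_experts * ffn_params          # attention, embeddings, norms, ...
    model_size = shared_params + n_experts_per_tok * ffn_params      # parameters active per token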
 
src/backend/run_eval_suite.py CHANGED
@@ -25,8 +25,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
-        result_dict["mfu"] = mfu * 100
-        result_dict["mbu"] = mbu * 100
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
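Note: MFU and MBU are now stored as raw fractions instead of pre-scaled percentages. Any percentage formatting is assumed to happen at display time, e.g.:

    mfu_display = f"{result_dict['mfu'] * 100:.2f}%"   # illustrative display-side formatting
    mbu_display = f"{result_dict['mbu'] * 100:.2f}%"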
src/backend/tasks/arena_hard/__init__.py ADDED
File without changes
src/backend/tasks/arena_hard/arena_hard.yaml ADDED
@@ -0,0 +1,2 @@
+task: arena_hard
+class: !function task.ArenaHard
src/backend/tasks/arena_hard/arena_judgment.py ADDED
@@ -0,0 +1,256 @@
+'''
+This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
+under the Apache 2.0 License from the arena-hard project.
+(https://github.com/lm-sys/arena-hard)
+Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
+See the NOTICE file distributed with this work for additional
+information regarding copyright ownership.
+'''
+
+import pandas as pd
+from tqdm import tqdm
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import math
+from collections import defaultdict
+from tqdm import tqdm
+
+from src.backend.tasks.arena_hard.arena_utils import (
+    chat_completion_openai,
+    load_questions,
+    load_model_answers,
+    get_endpoint,
+    make_config,
+)
+
+
+def get_score(judgment, pattern, pairwise=True):
+    matches = pattern.findall(judgment)
+    matches = [m for m in matches if m != ""]
+    if len(set(matches)) == 0:
+        return None, True
+    elif len(set(matches)) == 1:
+        if pairwise:
+            return matches[0].strip("\n"), False
+        return int(matches[0])
+    else:
+        return None, False
+
+
+# get answer from model
+def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
+    api_dict = get_endpoint(endpoint_dict["endpoints"])
+
+    # if endpoint_dict["api_type"] == "anthropic":
+    #     output = chat_completion_anthropic(model, conv, temperature, max_tokens)
+    # elif endpoint_dict["api_type"] == "azure":
+    #     output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)
+
+    output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
+    return output
+
+
+def judgment(**args):
+    question = args["question"]
+    answer = args["answer"]
+    reference = args["reference"]
+    baseline = args["baseline_answer"]
+    configs = args["configs"]
+    # output_file = args["output_file"]
+    model = configs["judge_model"]
+
+    num_games = 2 if configs["pairwise"] else 1
+
+    # output = {
+    #     "question_id": question["question_id"],
+    #     "judge": model,
+    #     "model": "custom_model",
+    #     "games": []
+    # }
+    output = [question["question_id"]]
+
+    for game in range(num_games):
+        conv = [{"role": "system", "content": configs["system_prompt"]}]
+
+        for template in configs["prompt_template"]:
+            prompt_args = {}
+
+            prompt_args[f"question_{1}"] = question["content"]
+            base = 1
+
+            if baseline:
+                if game % 2 == 1:  # swap position
+                    temp = baseline
+                    baseline = answer
+                    answer = temp
+
+                if game == 0:
+                    for i, turn in enumerate(baseline["choices"][0]["turns"]):
+                        prompt_args[f"answer_{i+1}"] = turn["content"]
+                        base += 1
+
+                if game == 1:
+                    prompt_args[f"answer_{1}"] = baseline
+                    base += 1
+
+            if answer:
+                prompt_args[f"answer_{base}"] = answer
+
+            if reference:
+                for j, ref_answer in enumerate(reference):
+                    for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
+                        prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]
+
+            user_prompt = template.format(**prompt_args)
+            conv.append({"role": "user", "content": user_prompt})
+
+        judgment = ""
+        for _ in range(2):
+            new_judgment = get_answer(
+                model,
+                conv,
+                configs["temperature"],
+                configs["max_tokens"],
+                args["endpoint_dict"],
+            )
+
+            judgment += ("\n" + new_judgment)
+
+            score, try_again = get_score(judgment, args["regex_pattern"])
+
+            conv.append({"role": "assistant", "content": new_judgment})
+
+            if not try_again:
+                break
+
+            conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
+        print("Finish judgment!!!")
+        # result = {
+        #     "user_prompt": conv[1]["content"],
+        #     "judgment": judgment,
+        #     "score": score
+        # }
+        output.append(score)
+
+    return output
+
+def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
+    arena_hard_battles = pd.DataFrame()
+
+    print("Turning score list into battles...")
+
+    for scores in tqdm(score_list):
+        question_id, score1, score2 = scores
+
+        # Process game 1
+        output = {"question_id": question_id,
+                  "model_a": "gpt-4-0314",
+                  "model_b": f"custom_model"}  # Unique identifier for model
+        weight = 1
+        if score1 == "A=B":
+            output["winner"] = "tie"
+        elif score1 == "A>B":
+            output["winner"] = "model_a"
+        elif score1 == "A>>B":
+            output["winner"] = "model_a"
+            weight = WEIGHT
+        elif score1 == "B>A":
+            output["winner"] = "model_b"
+        elif score1 == "B>>A":
+            output["winner"] = "model_b"
+            weight = WEIGHT
+        else:
+            weight = 0
+
+        if weight:
+            arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
+
+        if not first_game_only:
+            # Process game 2
+            output = {"question_id": question_id,
+                      "model_a": "gpt-4-0314",
+                      "model_b": f"custom_model"}  # Unique identifier for model
+            weight = 1
+            if score2 == "A=B":
+                output["winner"] = "tie"
+            elif score2 == "A>B":
+                output["winner"] = "model_b"
+            elif score2 == "A>>B":
+                output["winner"] = "model_b"
+                weight = WEIGHT
+            elif score2 == "B>A":
+                output["winner"] = "model_a"
+            elif score2 == "B>>A":
+                output["winner"] = "model_a"
+                weight = WEIGHT
+            else:
+                weight = 0
+
+            if weight:
+                arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
+
+    arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
+    return arena_hard_battles
+
+def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
+    models = pd.concat([df["model_a"], df["model_b"]]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    LOW_RATING = 100
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+
+    X = np.zeros([n, p])
+    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
+    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
+
+    # one A win => two A win
+    Y = np.zeros(n)
+    Y[df["winner"] == "model_a"] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
+    tie_idx[len(tie_idx)//2:] = False
+    Y[tie_idx] = 1.0
+
+    if len(np.unique(Y)) == 1:
+        # If there's only one class in the data, assign default ratings
+        elo_scores = np.full(p, LOW_RATING)
+        elo_scores[models["gpt-4-0314"]] = INIT_RATING
+    else:
+        lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
+        lr.fit(X,Y)
+
+        elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+    # set anchor as gpt-4-0314 = 1000
+    if "gpt-4-0314" in models.index:
+        elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
+    return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
+
+def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
+    names = sorted(list(elo_ratings.keys()))
+    wins = defaultdict(lambda: defaultdict(lambda: 0))
+    for a in names:
+        for b in names:
+            ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
+            wins[a][b] = ea
+            wins[b][a] = 1 - ea
+
+    data = {
+        a: [wins[a][b] if a != b else np.NAN for b in names]
+        for a in names
+    }
+
+    df = pd.DataFrame(data, index=names)
+    df.index.name = "model_a"
+    df.columns.name = "model_b"
+    return df.T
+
+def get_win_rate_column(df, column, baseline="gpt-4-0314"):
+    to_dict = df[["model", column]].set_index("model").to_dict()[column]
+    win_rate_table = predict_win_rate(to_dict)
+    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
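A minimal usage sketch of the scoring pipeline above, with toy verdicts rather than real judgments; each score entry is [question_id, game-1 verdict, game-2 verdict], the shape produced by judgment():

    import pandas as pd

    score_list = [
        ["q1", "A>B", "B>A"],
        ["q2", "A=B", "A=B"],
        ["q3", "B>>A", "A>>B"],
    ]
    battles = get_battles_from_scores(score_list)      # verdicts -> weighted pairwise battles (also written to ./arena_hard_battles.jsonl)
    elo = compute_mle_elo(battles)                      # Bradley-Terry fit via LogisticRegression
    stats = pd.DataFrame({"model": elo.index, "score": elo.values})
    win_rate = get_win_rate_column(stats, "score")      # win rate vs. the gpt-4-0314 baseline, in percent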
src/backend/tasks/arena_hard/arena_utils.py ADDED
@@ -0,0 +1,349 @@
+'''
+This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
+under the Apache 2.0 License from the arena-hard project.
+(https://github.com/lm-sys/arena-hard)
+Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
+See the NOTICE file distributed with this work for additional
+information regarding copyright ownership.
+'''
+
+
+import os
+import json
+import time
+import yaml
+import random
+
+from typing import Optional
+from glob import glob
+
+# API setting constants
+API_MAX_RETRY = 16
+API_RETRY_SLEEP = 10
+API_ERROR_OUTPUT = "$ERROR$"
+
+
+OPENAI_MODEL_LIST = (
+    "gpt-3.5-turbo",
+    "gpt-3.5-turbo-0301",
+    "gpt-3.5-turbo-0613",
+    "gpt-3.5-turbo-0613-verbose",
+    "gpt-3.5-turbo-1106",
+    "gpt-3.5-turbo-0125",
+    "gpt-4",
+    "gpt-4-0314",
+    "gpt-4-0613",
+    "gpt-4-turbo",
+    "gpt-4-1106-preview",
+    "gpt-4-0125-preview",
+)
+
+
+temperature_config = {
+    "writing": 0.7,
+    "roleplay": 0.7,
+    "extraction": 0.0,
+    "math": 0.0,
+    "coding": 0.0,
+    "reasoning": 0.0,
+    "stem": 0.1,
+    "humanities": 0.1,
+}
+
+
+def load_questions(question_file: str):
+    """Load questions from a file."""
+    questions = []
+    with open(question_file, "r") as ques_file:
+        for line in ques_file:
+            if line:
+                questions.append(json.loads(line))
+    return questions
+
+
+def load_model_answers(answer_dir: str):
+    """Load model answers.
+
+    The return value is a python dict of type:
+    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
+    """
+    filenames = glob(os.path.join(answer_dir, "*.jsonl"))
+    filenames.sort()
+    model_answers = {}
+
+    for filename in filenames:
+        model_name = os.path.basename(filename)[:-6]
+        answer = {}
+        with open(filename) as fin:
+            for line in fin:
+                line = json.loads(line)
+                answer[line["question_id"]] = line
+        model_answers[model_name] = answer
+
+    return model_answers
+
+
+def get_endpoint(endpoint_list):
+    if endpoint_list is None:
+        return None
+    assert endpoint_list is not None
+    # randomly pick one
+    api_dict = random.choices(
+        endpoint_list
+    )[0]
+    return api_dict
+
+
+# load config args from config yaml files
+def make_config(config_file: str) -> dict:
+    config_kwargs = {}
+    with open(config_file, "r") as f:
+        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
+
+    return config_kwargs
+
+
+def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
+    import openai
+    if api_dict:
+        client = openai.OpenAI(
+            base_url=api_dict["api_base"],
+            api_key=api_dict["api_key"],
+        )
+    else:
+        client = openai.OpenAI()
+
+    output = API_ERROR_OUTPUT
+    for _ in range(API_MAX_RETRY):
+        try:
+            # print(messages)
+            completion = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+                max_tokens=max_tokens
+            )
+            output = completion.choices[0].message.content
+            break
+        except openai.RateLimitError as e:
+            print(type(e), e)
+            time.sleep(API_RETRY_SLEEP)
+        except openai.BadRequestError as e:
+            print(messages)
+            print(type(e), e)
+        except KeyError:
+            print(type(e), e)
+            break
+
+    return output
+
+
+# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
+#     import openai
+#     from openai import AzureOpenAI
+
+#     api_base = api_dict["api_base"]
+#     client = AzureOpenAI(
+#         azure_endpoint = api_base,
+#         api_key= api_dict["api_key"],
+#         api_version=api_dict["api_version"],
+#         timeout=240,
+#         max_retries=2
+#     )
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             response = client.chat.completions.create(
+#                 model=model,
+#                 messages=messages,
+#                 n=1,
+#                 temperature=temperature,
+#                 max_tokens=max_tokens,
+#                 seed=42,
+#             )
+#             output = response.choices[0].message.content
+#             break
+#         except openai.RateLimitError as e:
+#             print(type(e), e)
+#             time.sleep(API_RETRY_SLEEP)
+#         except openai.BadRequestError as e:
+#             print(type(e), e)
+#             break
+#         except KeyError:
+#             print(type(e), e)
+#             break
+
+#     return output
+
+
+# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
+#     import anthropic
+
+#     if api_dict:
+#         api_key = api_dict["api_key"]
+#     else:
+#         api_key = os.environ["ANTHROPIC_API_KEY"]
+
+#     sys_msg = ""
+#     if messages[0]["role"] == "system":
+#         sys_msg = messages[0]["content"]
+#         messages = messages[1:]
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             # print(sys_msg)
+#             c = anthropic.Anthropic(api_key=api_key)
+#             response = c.messages.create(
+#                 model=model,
+#                 messages=messages,
+#                 stop_sequences=[anthropic.HUMAN_PROMPT],
+#                 max_tokens=max_tokens,
+#                 temperature=temperature,
+#                 system=sys_msg
+#             )
+#             output = response.content[0].text
+#             break
+#         except anthropic.APIError as e:
+#             print(type(e), e)
+#             time.sleep(API_RETRY_SLEEP)
+#     return output
+
+
+# def chat_completion_mistral(model, messages, temperature, max_tokens):
+#     from mistralai.client import MistralClient
+#     from mistralai.models.chat_completion import ChatMessage
+#     from mistralai.exceptions import MistralException
+
+#     api_key = os.environ["MISTRAL_API_KEY"]
+#     client = MistralClient(api_key=api_key)
+
+#     prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             chat_response = client.chat(
+#                 model=model,
+#                 messages=prompts,
+#                 temperature=temperature,
+#                 max_tokens=max_tokens,
+#             )
+#             output = chat_response.choices[0].message.content
+#             break
+#         except MistralException as e:
+#             print(type(e), e)
+#             break
+
+#     return output
+
+
+# def chat_completion_gemini(model, messages, temperature, max_tokens):
+#     import google.generativeai as genai
+#     genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+
+#     safety_settings = [
+#         {
+#             "category": "HARM_CATEGORY_HARASSMENT",
+#             "threshold": "BLOCK_NONE"
+#         },
+#         {
+#             "category": "HARM_CATEGORY_HATE_SPEECH",
+#             "threshold": "BLOCK_NONE"
+#         },
+#         {
+#             "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+#             "threshold": "BLOCK_NONE"
+#         },
+#         {
+#             "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+#             "threshold": "BLOCK_NONE"
+#         },
+#     ]
+
+#     # Set up the model
+#     generation_config = {
+#         "temperature": temperature,
+#         "top_p": 1,
+#         "top_k": 1,
+#         "max_output_tokens": max_tokens,
+#     }
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             gemini = genai.GenerativeModel(
+#                 model_name=model,
+#                 generation_config=generation_config,
+#                 safety_settings=safety_settings)
+
+#             convo = gemini.start_chat(history=[])
+
+#             convo.send_message(messages)
+#             output = convo.last.text
+#             break
+#         except genai.types.generation_types.StopCandidateException as e:
+#             print(type(e), e)
+#             break
+#         except Exception as e:
+#             print(type(e), e)
+#             time.sleep(API_RETRY_SLEEP)
+
+#     return output
+
+
+# def chat_completion_cohere(model, messages, temperature, max_tokens):
+#     import cohere
+
+#     co = cohere.Client(os.environ["COHERE_API_KEY"])
+#     assert len(messages) > 0
+
+#     template_map = {"system":"SYSTEM",
+#                     "assistant":"CHATBOT",
+#                     "user":"USER"}
+
+#     assert messages[-1]["role"] == "user"
+#     prompt = messages[-1]["content"]
+
+#     if len(messages) > 1:
+#         history = []
+#         for message in messages[:-1]:
+#             history.append({"role":template_map[message["role"]], "message":message["content"]})
+#     else:
+#         history = None
+
+#     output = API_ERROR_OUTPUT
+#     for _ in range(API_MAX_RETRY):
+#         try:
+#             response = co.chat(
+#                 message=prompt,
+#                 model=model,
+#                 temperature=temperature,
+#                 max_tokens=max_tokens,
+#                 chat_history=history,
+#             )
+#             output = response.text
+#             break
+#         except cohere.core.api_error.ApiError as e:
+#             print(type(e), e)
+#             raise
+#         except Exception as e:
+#             print(type(e), e)
+#             break
+
+#     return output
+
+
+def reorg_answer_file(answer_file):
+    """Sort by question id and de-duplication"""
+    answers = {}
+    with open(answer_file, "r") as fin:
+        for l in fin:
+            qid = json.loads(l)["question_id"]
+            answers[qid] = l
+
+    qids = sorted(list(answers.keys()))
+    with open(answer_file, "w") as fout:
+        for qid in qids:
+            fout.write(answers[qid])
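A short sketch of how the judge wires these helpers together (the file path and prompt are illustrative, and an OPENAI_API_KEY is assumed to be set in the environment):

    endpoints = make_config("configs/api_config.yaml")        # e.g. {"gpt-4-1106-preview": {...}}
    info = endpoints["gpt-4-1106-preview"]
    api_dict = get_endpoint(info["endpoints"])                 # None here, so the default OpenAI client is used
    messages = [{"role": "user", "content": "Say hello."}]
    reply = chat_completion_openai("gpt-4-1106-preview", messages,
                                   temperature=0.0, max_tokens=64, api_dict=api_dict)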
src/backend/tasks/arena_hard/configs/api_config.yaml ADDED
@@ -0,0 +1,17 @@
+# gpt-3.5-turbo:
+#     model_name: gpt-3.5-turbo
+#     endpoints: null
+#     api_type: openai
+#     parallel: 8
+
+gpt-4-1106-preview:
+    model_name: gpt-4-1106-preview
+    endpoints: null
+    api_type: openai
+    parallel: 8
+
+# llama3-7b:
+#     model_name: llama3-7b
+#     endpoints: null
+#     api_type: openai
+#     parallel: 8
src/backend/tasks/arena_hard/configs/judge_config.yaml ADDED
@@ -0,0 +1,26 @@
+name: judgment config file for Arena Hard
+
+bench_name: arena-hard-v0.1
+
+# Arena Hard default
+judge_model: gpt-4-1106-preview
+# judge_model: gpt-3.5-turbo
+reference: False # Optional
+ref_model: null
+
+baseline: True
+baseline_model: gpt-4-0314
+
+pairwise: True
+temperature: 0
+max_tokens: 4096
+
+regex_pattern: \[\[([AB<>=]+)\]\]
+
+system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
+
+prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
+
+# Add your model below for evaluation
+# model_list:
+#   - gpt-3.5-turbo-0125
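The regex_pattern above is what get_score() in arena_judgment.py applies to the judge's reply; for example:

    import re
    pattern = re.compile(r"\[\[([AB<>=]+)\]\]")
    verdict = "My final verdict is that Assistant A is slightly better: [[A>B]]"
    print(pattern.findall(verdict))   # ['A>B']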
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
src/backend/tasks/arena_hard/question.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
src/backend/tasks/arena_hard/task.py ADDED
@@ -0,0 +1,220 @@
+import os
+from typing import Union, List
+
+from lm_eval.api.task import ConfigurableTask
+from lm_eval.api.instance import Instance
+
+# from lm_eval.api.registry import register_task
+from lm_eval.api.metrics import mean
+
+from src.backend.envs import DEVICE
+
+import pandas as pd
+
+from src.backend.tasks.measurement_task_utils import measure_system_metrics
+import json
+
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from datasets import Dataset
+import re
+
+from src.backend.tasks.arena_hard.arena_utils import (
+    load_questions,
+    load_questions,
+    load_model_answers,
+    make_config,
+)
+
+from src.backend.tasks.arena_hard.arena_judgment import (
+    judgment,
+    get_battles_from_scores,
+    compute_mle_elo,
+    predict_win_rate,
+    get_win_rate_column
+)
+
+def load_questions(question_file: str):
+    """Load questions from a file."""
+    questions = []
+    with open(question_file, "r") as ques_file:
+        for line in ques_file:
+            if line:
+                questions.append(json.loads(line))
+    return questions
+
+def download_wrapper(func):
+    def download(self, *args, **kwargs):
+        print("Using Arena Hard, No need to download")
+    return download
+
+original_download = ConfigurableTask.download
+ConfigurableTask.download = download_wrapper(original_download)
+# @register_task("selfcheckgpt")
+@measure_system_metrics
+class ArenaHard(ConfigurableTask):
+    VERSION = 0.0
+    OUTPUT_TYPE = "generate_until"
+    data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
+    judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
+    configs = make_config(judge_config_path)
+    model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
+    model_answers = load_model_answers(model_ans_dir)
+    data = load_questions(data_path)
+
+    def __init__(self):
+        super().__init__(config={"metadata": {"version": self.VERSION}})
+        # these end tokens are hard coded because of the current limitaion of the llm-eval.
+        # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
+        self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_length": 4096}
+        # self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
+        # self.generation_kwargs_sampling = {
+        #     "temperature": 0.99,
+        #     "do_sample": True,
+        #     "until": ["<im_end>", "<im_end>"],
+        #     "max_length": 1024,
+        # }
+
+    def transform_data(self, data):
+        transformed_data = []
+        for i in range(len(data)):
+            if self.configs["baseline"]:
+                baseline_answer = self.model_answers[self.configs["baseline_model"]][data[i]["question_id"]]
+            else:
+                baseline_answer = None
+            transformed_item = {
+                "question_id": data[i]["question_id"],
+                "content": data[i]["turns"][0]["content"], # Assuming you want the first turn's content
+                "model_answer": baseline_answer
+            }
+            transformed_data.append(transformed_item)
+        return transformed_data
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return False
+
+    def validation_docs(self):
+        self.dataset = self.transform_data(self.data)
+        self.dataset = Dataset.from_dict({"question_id": [item["question_id"] for item in self.dataset],
+                                          "content": [item["content"] for item in self.dataset],
+                                          "model_answer": [item["model_answer"] for item in self.dataset]})
+        return self.dataset
+
+    def doc_to_text(self, doc):
+        sentence = doc["content"]
+        doc_text = f"{sentence}\n"
+        return doc_text
+
+    def doc_to_target(self, doc):
+        q_id = doc["question_id"]
+        return q_id
+
+    def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
+        arguments = (ctx, self.generation_kwargs)
+        request_list = [
+            Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
+        ]
+        # sampling_arguments = (ctx, self.generation_kwargs_sampling)
+        # request_list.extend(
+        #     [
+        #         Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
+        #         for idx in range(1, self.generation_kwargs_sampling_number + 1)
+        #     ]
+        # )
+        return request_list
+
+    def process_results(self, doc, results):
+        response_temperature_0 = results[0]
+        # other_responses = results[1:]
+        api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
+        endpoint_list = make_config(api_config_path)
+
+        if self.configs["regex_pattern"]:
+            pattern = re.compile(self.configs["regex_pattern"])
+
+        ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")
+
+        ref_answers = None
+        if self.configs["reference"]:
+            ref_answers = load_model_answers(ref_answer_dir)
+            ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]
+
+        # output_files = {}
+        # models = ["custom_model"]
+        # output_dir = f"{os.path.join(os.path.dirname(__file__))}/model_judgments/{self.configs['judge_model']}"
+        # for model in models:
+        #     output_files[model] = os.path.join(
+        #         output_dir,
+        #         f"{model}.jsonl",
+        #     )
+
+        # for output_file in output_files.values():
+        #     os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+        endpoint_info = endpoint_list[self.configs["judge_model"]]
+
+        question = doc
+        kwargs = {}
+        kwargs["question"] = question
+        kwargs["answer"] = response_temperature_0
+        if ref_answers:
+            kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
+            assert len(kwargs["reference"]) == len(self.configs["ref_model"])
+        else:
+            kwargs["reference"] = None
+
+        if self.configs["baseline"]:
+            kwargs["baseline_answer"] = doc["model_answer"]
+        else:
+            kwargs["baseline_answer"] = None
+        kwargs["configs"] = self.configs
+        kwargs["endpoint_dict"] = endpoint_info
+        # kwargs["output_file"] = output_files["custom_model"]
+        kwargs["regex_pattern"] = pattern
+
+        scores = judgment(**kwargs)
+        return {"score": scores}
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        ##TODO implement the aggregation function to calculate elo for score
+        def get_win_rate(score_list):
+            battles = get_battles_from_scores(score_list)
+            bootstrap_online_elo = compute_mle_elo(battles)
+            stats = pd.DataFrame()
+            stats["results"] = None
+            stats["results"] = stats['results'].astype('object')
+            for i, model in enumerate(bootstrap_online_elo.index):
+                stats.at[i, "model"] = model
+                stats.at[i, "score"] = bootstrap_online_elo[model]
+
+            stats.sort_values(by="model", inplace=True)
+            stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()
+
+            return stats["score"][1]
+
+        return {k: get_win_rate for k in ["score"]}
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        return {k: True for k in ["score"]}
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": ["<im_end>"], "max_length": 1024}
+        self.generation_kwargs = {"until": ["<|im_end|>"], "max_length": 1024}
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {
             "temperature": 0.99,
             "do_sample": True,
-            "until": ["<im_end>", "</s>"],
+            "until": ["<|im_end|>", "</s>"],
             "max_length": 1024,
         }
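Note: the stop string now matches the literal ChatML end-of-turn token; the old "<im_end>" (without the pipes) never occurs in chat-templated output, so generation would not stop on it. Illustrative only:

    generated = "Paris is the capital of France.<|im_end|>"
    answer = generated.split("<|im_end|>")[0]   # trimming works; "<im_end>" would never match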
 
src/display/utils.py CHANGED
@@ -79,10 +79,11 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")

     # # XXX include me back at some point
-    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
+    # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
     gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
     # gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
+    arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score


 # These classes are for user facing column names,
@@ -115,9 +116,9 @@ for task in Tasks:
     auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
     # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
     # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
@@ -187,6 +188,7 @@ class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
     MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
+    VLLM = ModelDetails("vllm_moe")
     Unknown = ModelDetails("?")

     def to_str(self):
@@ -198,12 +200,13 @@
             return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
+        if inference_framework in ["vllm_moe"]:
+            return InferenceFramework.VLLM
         return InferenceFramework.Unknown

 class GPUType(Enum):
-    H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB")
+    A100_sxm = ModelDetails("NVIDIA-A100-SXM4-80GB")
     A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
-    A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB")
     Unknown = ModelDetails("?")

     def to_str(self):
@@ -211,12 +214,10 @@

     @staticmethod
     def from_str(gpu_type: str):
-        if gpu_type in ["NVIDIA-H100-PCIe-80GB"]:
-            return GPUType.A100_pcie
         if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
-            return GPUType.H100_pcie
-        if gpu_type in ["NVIDIA-A5000-24GB"]:
-            return GPUType.A5000
+            return GPUType.A100_pcie
+        if gpu_type in ["NVIDIA-A100-SXM4-80GB"]:
+            return GPUType.A100_sxm
         return GPUType.Unknown

 class WeightType(Enum):
src/leaderboard/read_evals.py CHANGED
@@ -116,7 +116,7 @@ class EvalResult:
 multiplier = 1.0
 if "time" in metric:
     multiplier = 1.0
-if "throughput" in metric or "mfu" in metric or "mbu" in metric:
+if "throughput" in metric:
     multiplier = 1.0
 if "batch_" in metric or "Mem" in metric or "Util" in metric:
     multiplier = 1
@@ -124,7 +124,10 @@

 # print('RESULTS', data['results'])
 # print('XXX', benchmark, metric, value, multiplier)
-results[benchmark][metric] = value * multiplier
+if value == "N/A":
+    results[benchmark][metric] = None
+else:
+    results[benchmark][metric] = value * multiplier

 res = EvalResult(
     eval_name=result_key,
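Note: backend result files may now contain the string "N/A" for metrics that could not be measured, and multiplying that sentinel would raise a TypeError, hence the new guard. A tiny illustration:

    value = "N/A"
    # value * 1.0  ->  TypeError: can't multiply sequence by non-int of type 'float'
    metric_value = None if value == "N/A" else value * 1.0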