future-xy committed
Commit 88d1c0e • Parent(s): 1ae96c8

fix display

Files changed:
- backend-cli.py  +2 -3
- src/backend/tasks/measurement_task_utils.py  +5 -9
- src/display/utils.py  +4 -4
- src/leaderboard/read_evals.py  +4 -1
- src/populate.py  +12 -4
backend-cli.py  CHANGED
@@ -12,7 +12,6 @@ from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
-LIMIT=2
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
@@ -150,10 +149,10 @@ def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
         else:
             raise
 
-    print("RESULTS", results)
+    # print("RESULTS", results)
 
     dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
-    print(dumped)
+    # print(dumped)
 
     output_path = os.path.join(
         EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json"
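The retained json.dumps call is what keeps result serialization from failing on non-JSON values. A minimal standalone sketch of that fallback behavior, using a fabricated results dict rather than real harness output:

import json
from datetime import datetime

# Fabricated results dict; stands in for the `results` returned by run_evaluation.
results = {"model": "org/model", "score": 0.5, "finished_at": datetime(2024, 1, 1)}

# Values json cannot handle (the datetime here) are replaced instead of raising TypeError.
dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
print(dumped)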
src/backend/tasks/measurement_task_utils.py  CHANGED
@@ -8,21 +8,17 @@ def process_results_decorator(func):
     def wrapper(self, doc, results, *args, **kwargs):
         # We process the results here
         processed_results = [r[0] for r in results]
-
-        # end_to_end_time = end_to_end_time / batch_size
-        # prefilling_time = prefilling_time / batch_size
-        # token_per_sec = output_length / (decoding_time / batch_size)
 
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
-
-        print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time},
+        decoding_throughput = sum([r[3] for r in results]) / len(results)
+        print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
-        result_dict["
+        result_dict["decoding_throughput"] = decoding_throughput
         return result_dict
     return wrapper
 
@@ -33,7 +29,7 @@ def aggregation_decorator(func):
         aggregation_list = func(self, *args, **kwargs)
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
-        aggregation_list["
+        aggregation_list["decoding_throughput"] = mean
         return aggregation_list
     return wrapper
 
@@ -44,7 +40,7 @@ def higher_is_better_decorator(func):
         higher_is_better_dict = func(self, *args, **kwargs)
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
-        higher_is_better_dict["
+        higher_is_better_dict["decoding_throughput"] = True
        return higher_is_better_dict
     return wrapper
 
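To see what the updated process_results_decorator now produces, here is a self-contained sketch: the decorator body mirrors the new version above, while DummyTask and the (answer, end-to-end, prefilling, decoding-throughput) result tuples are made up for illustration.

def process_results_decorator(func):
    # Same shape as the updated decorator: each result is
    # (answer, end_to_end_time, prefilling_time, decoding_throughput).
    def wrapper(self, doc, results, *args, **kwargs):
        processed_results = [r[0] for r in results]
        end_to_end_time = sum([r[1] for r in results]) / len(results)
        prefilling_time = sum([r[2] for r in results]) / len(results)
        decoding_throughput = sum([r[3] for r in results]) / len(results)
        result_dict = func(self, doc, processed_results, *args, **kwargs)
        result_dict["end_to_end_time"] = end_to_end_time
        result_dict["prefilling_time"] = prefilling_time
        result_dict["decoding_throughput"] = decoding_throughput
        return result_dict
    return wrapper

class DummyTask:  # hypothetical task, not part of the repo
    @process_results_decorator
    def process_results(self, doc, results):
        return {"em": float(results[0] == doc["answer"])}

out = DummyTask().process_results({"answer": "Paris"}, [("Paris", 1.2, 0.3, 45.0)])
print(out)  # {'em': 1.0, 'end_to_end_time': 1.2, 'prefilling_time': 0.3, 'decoding_throughput': 45.0}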
src/display/utils.py  CHANGED
@@ -73,12 +73,12 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 # Inference framework
 auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent("Inference framework", "str", True)])
 
-# System performance metrics
-auto_eval_column_dict.append(["prefilling_time", ColumnContent, ColumnContent("Prefilling time (s)", "number", True)])
-auto_eval_column_dict.append(["token_per_second", ColumnContent, ColumnContent("Tokens/s", "number", True)])
-
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    # System performance metrics
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} End-to-end time (s)", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} Prefilling time (s)", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} Decoding throughput (tok/s)", "number", True)])
 
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
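As a quick check of the column keys and display names this loop now generates, here is a sketch with a stand-in Tasks enum (the real enum comes from src.backend.envs; the "nq_open" task is hypothetical):

from enum import Enum
from collections import namedtuple

TaskInfo = namedtuple("TaskInfo", ["benchmark", "metric", "col_name"])

class Tasks(Enum):  # stand-in for src.backend.envs.Tasks
    task0 = TaskInfo("nq_open", "em", "NQ Open")

for task in Tasks:
    # One score column plus three system-performance columns per task.
    print(task.name, "->", task.value.col_name)
    print(f"{task.name}_end_to_end_time", "->", f"{task.value.col_name} End-to-end time (s)")
    print(f"{task.name}_prefilling_time", "->", f"{task.value.col_name} Prefilling time (s)")
    print(f"{task.name}_decoding_throughput", "->", f"{task.value.col_name} Decoding throughput (tok/s)")

These per-task display names are what the flattened keys built in src/populate.py (below) must match for the values to land in the right leaderboard columns.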
src/leaderboard/read_evals.py  CHANGED
@@ -107,7 +107,10 @@ class EvalResult:
            multiplier = 1.0
            if "squad" in benchmark:
                multiplier = 1.0
-
+            if "time" in metric:
+                multiplier = 1.0
+            if "throughput" in metric:
+                multiplier = 1.0
            # print('RESULTS', data['results'])
            # print('XXX', benchmark, metric, value, multiplier)
            results[benchmark][metric] = value * multiplier
src/populate.py  CHANGED
@@ -30,7 +30,8 @@ def get_leaderboard_df(
         raw_data[result_idx], requests_path_open_llm
     )
 
-    all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
+    # all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
+    all_data_json_ = [v.to_dict() for v in raw_data]  # include incomplete evals
 
     name_to_bm_map = {}
 
@@ -45,15 +46,22 @@ def get_leaderboard_df(
         name_to_bm_map[name] = bm
 
     # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
+    system_metrics_to_name_map = {
+        "end_to_end_time": "End-to-end time (s)",
+        "prefilling_time": "Prefilling time (s)",
+        "decoding_throughput": "Decoding throughput (tok/s)",
+    }
 
     all_data_json = []
     for entry in all_data_json_:
         new_entry = copy.deepcopy(entry)
-
         for k, v in entry.items():
             if k in name_to_bm_map:
                 benchmark, metric = name_to_bm_map[k]
                 new_entry[k] = entry[k][metric]
+                for sys_metric, metric_namne in system_metrics_to_name_map.items():
+                    if sys_metric in entry[k]:
+                        new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]
 
         all_data_json += [new_entry]
 
@@ -69,10 +77,10 @@ def get_leaderboard_df(
             df[col] = np.nan
 
     if not df.empty:
-        df = df
+        df = df.round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    # df = df[has_no_nan_values(df, benchmark_cols)]
 
     return raw_data, df
 
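A toy walk-through of the flattening step above, using a fabricated entry (the benchmark name and numbers are invented; the mapping dict matches the one added in the diff):

system_metrics_to_name_map = {
    "end_to_end_time": "End-to-end time (s)",
    "prefilling_time": "Prefilling time (s)",
    "decoding_throughput": "Decoding throughput (tok/s)",
}

# Fabricated raw entry: each benchmark column holds a dict of metrics.
entry = {"NQ Open": {"em": 0.42, "end_to_end_time": 1.8, "decoding_throughput": 37.5}}
name_to_bm_map = {"NQ Open": ("nq_open", "em")}

new_entry = dict(entry)
for k, v in entry.items():
    if k in name_to_bm_map:
        benchmark, metric = name_to_bm_map[k]
        new_entry[k] = entry[k][metric]  # score column keeps only the headline metric
        for sys_metric, metric_name in system_metrics_to_name_map.items():
            if sys_metric in entry[k]:  # prefilling_time is missing here, so it is skipped
                new_entry[f"{k} {metric_name}"] = entry[k][sys_metric]

print(new_entry)
# {'NQ Open': 0.42, 'NQ Open End-to-end time (s)': 1.8, 'NQ Open Decoding throughput (tok/s)': 37.5}

The generated keys ("NQ Open End-to-end time (s)", and so on) line up with the per-task column names registered in src/display/utils.py, which is the point of this "fix display" commit.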