sync from github
- src/backend/hflm_with_measurement.py +17 -4
- src/utils.py +0 -37
src/backend/hflm_with_measurement.py
CHANGED
@@ -37,7 +37,7 @@ from lm_eval.models.utils import (
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
-from src.utils import
+from src.utils import get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
 from src.submission.check_validity import get_model_size
 from src.envs import API
 
@@ -73,6 +73,18 @@ class HFLMWithMeasurement(HFLM):
         self.pretrained = kwargs.get("pretrained", None)
         self.revision = kwargs.get("revision", None)
         self.precision = kwargs.get("dtype", None)
+        self.num_gpus = None
+
+    def _detect_num_gpus_used(self):
+        if self.num_gpus is not None:
+            return self.num_gpus
+        gpus = []
+        for p in self.model.parameters():
+            if p.device.type == "cuda":
+                gpus.append(p.device.index)
+
+        self.num_gpus = len(set(gpus))
+        return self.num_gpus
 
     def _loglikelihood_tokens(
         self,
@@ -352,7 +364,8 @@ class HFLMWithMeasurement(HFLM):
             else:
                 continue
         print(f"linear_count: {linear_count}")
-        print(f"element_wise_mul: {element_wise_mul}")
+        print(f"element_wise_mul: {element_wise_mul}")
+        print(f"GPU usage: {self._detect_num_gpus_used()}")
 
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
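The _detect_num_gpus_used helper added above derives the GPU count from where the model's parameters actually live: it collects the CUDA device index of every parameter, counts the distinct indices, and caches the result in self.num_gpus. A minimal standalone sketch of the same idea, assuming a plain PyTorch module (the function name and example model are placeholders, not part of this commit):

import torch
import torch.nn as nn

def count_cuda_devices_used(model: nn.Module) -> int:
    # Collect the device index of every parameter placed on a GPU and
    # count the unique indices; CPU or offloaded parameters are ignored.
    indices = {p.device.index for p in model.parameters() if p.device.type == "cuda"}
    return len(indices)

# Hypothetical usage: a tiny model on a single GPU reports 1.
if torch.cuda.is_available():
    tiny = nn.Linear(16, 16).to("cuda:0")
    print(count_cuda_devices_used(tiny))  # -> 1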
@@ -423,7 +436,7 @@ class HFLMWithMeasurement(HFLM):
         per_token_kv_size = 2 * n_layers * d_model * precision_bytes
 
         peak_bw_single = get_peak_bw(get_gpu_details())
-        peak_bw = peak_bw_single *
+        peak_bw = peak_bw_single * self._detect_num_gpus_used()
 
         context_prefill_size = context_length
         kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
@@ -441,7 +454,7 @@ class HFLMWithMeasurement(HFLM):
         avg_context_length = context_length + (output_length - 1) / 2
         flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
         peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
-        peak_flops = peak_flops_single *
+        peak_flops = peak_flops_single * self._detect_num_gpus_used()
 
         ## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial
         mfu = token_per_sec * flops_per_token / peak_flops
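With the detected GPU count, the single-device peak bandwidth and peak FLOPS are scaled to the aggregate of all GPUs the model is sharded across, and MFU is then the achieved FLOPs per second divided by that aggregate peak. A back-of-the-envelope sketch with made-up numbers (the 7B model size, 35 tokens/s throughput, and the 312 TFLOPS A100-class peak are illustrative assumptions; the smaller attention and vocab terms from the diff are omitted):

# Hypothetical figures: a 7B-parameter model decoding at 35 tokens/s,
# sharded across 2 GPUs, each peaking at 312 TFLOPS in bf16.
model_size = 7e9
token_per_sec = 35.0
flops_per_token = 2 * model_size          # dominant 2*N term only
peak_flops_single = 312e12
num_gpus_used = 2

peak_flops = peak_flops_single * num_gpus_used
mfu = token_per_sec * flops_per_token / peak_flops
print(f"MFU: {mfu:.2%}")                  # about 0.08% at batch size 1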
src/utils.py
CHANGED
@@ -174,43 +174,6 @@ def analyze_gpu_stats(stats_list):
 
     return avg_stats
 
-def get_gpu_number():
-    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
-    if visible_devices is not None:
-        gpu_indices = visible_devices.split(',')
-    else:
-        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
-        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
-        if result.returncode != 0:
-            print("Failed to query GPU indices.")
-            return []
-        gpu_indices = result.stdout.strip().split('\n')
-    # print(f"gpu_indices: {gpu_indices}")
-    gpu_stats = []
-
-    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-
-    for index in gpu_indices:
-        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
-        output = result.stdout.strip()
-        lines = output.split("\n")
-        for line in lines:
-            match = gpu_info_pattern.search(line)
-            gpu_info = {}
-            if match:
-                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
-                gpu_info.update({
-                    GPU_TEMP: temp,
-                    GPU_Power: power_usage,
-                    GPU_Mem: round(mem_usage / 1024, 2),
-                    GPU_Util: gpu_util
-                })
-
-        if len(gpu_info) >= 4:
-            gpu_stats.append(gpu_info)
-
-    return len(gpu_stats)
-
 def get_gpu_details():
     gpus = GPUtil.getGPUs()
     gpu = gpus[0]
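The deleted get_gpu_number() counted GPUs by shelling out to nvidia-smi and regex-parsing its table output; after this commit the backend derives the count from the model parameters' devices instead. If a standalone, process-level GPU count is ever needed again, a simpler sketch that respects CUDA_VISIBLE_DEVICES without parsing nvidia-smi (this helper is an illustration, not something this commit adds):

import os
import torch

def visible_gpu_count() -> int:
    # torch.cuda.device_count() already honours CUDA_VISIBLE_DEVICES,
    # so no nvidia-smi parsing is required.
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    # Fall back to the environment variable when CUDA is unavailable.
    visible = os.getenv("CUDA_VISIBLE_DEVICES", "")
    return len([d for d in visible.split(",") if d.strip()])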