import logging

import torch
from pytorch_lightning import Callback, Trainer, LightningModule

log = logging.getLogger(__name__)  # We want a logger for each process, not just for rank 0


def l2_promote():
    import ctypes
    _libcudart = ctypes.CDLL('libcudart.so')
    # Set device limit on the current device
    # cudaLimitMaxL2FetchGranularity = 0x05
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    assert pValue.contents.value == 128
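
# Note: 0x05 is the value of the cudaLimitMaxL2FetchGranularity enum in CUDA's driver_types.h,
# and 128 bytes is the largest fetch granularity the runtime accepts. A minimal sketch of
# invoking the helper on each rank (the torch.cuda guard is an assumption, not part of the
# original module):
#
#     if torch.cuda.is_available():
#         l2_promote()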


def set_affinity(trainer):
    try:
        # Alias the import to avoid shadowing this module-level function's name.
        from src.utils.gpu_affinity import set_affinity as _set_gpu_affinity
        nproc_per_node = torch.cuda.device_count()
        affinity = _set_gpu_affinity(trainer.local_rank, nproc_per_node, 'socket_unique_continuous')
        log.info(f'{trainer.local_rank}: thread affinity: {affinity}')
        # TD [2022-05-07] Somehow calling this causes GPU 0 to allocate an extra ~800MB of memory
        # per GPU (e.g., 6.4GB of extra memory in an 8-GPU setup). H/t Dan.
        # l2_promote()
    except Exception:
        # Affinity pinning is a performance optimization, not a requirement; if the helper module
        # is unavailable or the platform does not support it, train without it.
        log.warning(f'{trainer.local_rank}: failed to set thread affinity', exc_info=True)
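
# For reference, a minimal sketch of what such an affinity helper does under the hood. This is an
# assumption based on NVIDIA's DeepLearningExamples gpu_affinity module; the real
# 'socket_unique_continuous' mode is additionally NUMA-aware, binding each rank to cores on the
# socket closest to its GPU. `naive_set_affinity` is a hypothetical name, not part of this module.
#
#     import os
#
#     def naive_set_affinity(local_rank, nproc_per_node):
#         # Give each local rank a unique, contiguous slice of the CPU cores (Linux only).
#         cores_per_proc = os.cpu_count() // nproc_per_node
#         cpus = range(local_rank * cores_per_proc, (local_rank + 1) * cores_per_proc)
#         os.sched_setaffinity(0, cpus)  # pid 0 means the current process
#         return os.sched_getaffinity(0)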


class GpuAffinity(Callback):
    """Set GPU affinity and increase the L2 fetch granularity.

    Adapted from https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL
    """

    def setup(self, trainer: Trainer, pl_module: LightningModule, stage=None) -> None:
        set_affinity(trainer)
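
# Usage sketch: register the callback when constructing the Lightning Trainer. `MyModel` and
# `my_datamodule` are hypothetical placeholders, not part of this module; depending on the
# installed Lightning version, `gpus=8` may be needed instead of `accelerator`/`devices`.
#
#     from pytorch_lightning import Trainer
#
#     trainer = Trainer(accelerator='gpu', devices=8, callbacks=[GpuAffinity()])
#     trainer.fit(MyModel(), my_datamodule)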