import logging

import torch
from pytorch_lightning import Callback, Trainer, LightningModule

log = logging.getLogger(__name__)  # We want a logger for each process, not just rank 0


def l2_promote():
    """Increase the maximum L2 fetch granularity on the current device to 128 bytes."""
    import ctypes
    _libcudart = ctypes.CDLL('libcudart.so')
    # Set the device limit on the current device.
    # cudaLimitMaxL2FetchGranularity = 0x05
    # cudaDeviceSetLimit takes a size_t value and cudaDeviceGetLimit writes a
    # size_t, so use c_size_t on both sides.
    pValue = ctypes.cast((ctypes.c_size_t * 1)(), ctypes.POINTER(ctypes.c_size_t))
    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_size_t(128))
    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    assert pValue.contents.value == 128
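

# The calls above discard the cudaError_t return codes and rely on an assert.
# A minimal sketch of a stricter variant (`_l2_promote_checked` is a
# hypothetical helper, not part of the original file) that raises on failure:
def _l2_promote_checked(granularity: int = 128) -> None:
    import ctypes
    libcudart = ctypes.CDLL('libcudart.so')
    # cudaDeviceSetLimit(limit, value) returns cudaSuccess (0) on success;
    # 0x05 is cudaLimitMaxL2FetchGranularity.
    err = libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_size_t(granularity))
    if err != 0:
        raise RuntimeError(f'cudaDeviceSetLimit failed with CUDA error {err}')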


def set_affinity(trainer):
    try:
        # The local import intentionally shadows this function's name; only the
        # helper from src.utils.gpu_affinity is used inside the try block.
        from src.utils.gpu_affinity import set_affinity
        nproc_per_node = torch.cuda.device_count()
        affinity = set_affinity(trainer.local_rank, nproc_per_node, 'socket_unique_continuous')
        log.info(f'{trainer.local_rank}: thread affinity: {affinity}')
        # TD [2022-05-07] Somehow calling this causes GPU 0 to allocate an extra ~800MB
        # of memory per GPU (e.g., 6.4GB of extra memory in an 8-GPU setup). H/t Dan.
        # l2_promote()
    except Exception:
        # Don't crash training if affinity setup is unavailable, but don't
        # swallow the failure silently either.
        log.warning('Failed to set GPU affinity', exc_info=True)
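
# For reference: NVIDIA's gpu_affinity helper ultimately pins the process with
# the standard os.sched_setaffinity call, mapping each local rank to CPU cores
# near its GPU. A minimal sketch of that mechanism (the core set {0, 1, 2, 3}
# is an arbitrary example):
#
#     import os
#     os.sched_setaffinity(0, {0, 1, 2, 3})  # pin the current process to cores 0-3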


class GpuAffinity(Callback):
    """Set GPU affinity and increase the L2 fetch granularity.

    Adapted from https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL
    """

    def setup(self, trainer: Trainer, pl_module: LightningModule, stage=None) -> None:
        set_affinity(trainer)
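
# Usage sketch: pass the callback to the Trainer so that setup() pins affinity
# on every rank before training starts. `MyLitModel` and `datamodule` below are
# placeholders, not part of this file:
#
#     trainer = Trainer(accelerator='gpu', devices=8, callbacks=[GpuAffinity()])
#     trainer.fit(MyLitModel(), datamodule=datamodule)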