glenn-jocher
committed on
Commit
•
e8810a5
1
Parent(s):
fbf41e0
Update DDP backend `if dist.is_nccl_available()` (#3705)
Browse files
train.py
CHANGED
@@ -539,7 +539,7 @@ def main(opt):
|
|
539 |
assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
|
540 |
torch.cuda.set_device(LOCAL_RANK)
|
541 |
device = torch.device('cuda', LOCAL_RANK)
|
542 |
-
dist.init_process_group(backend="gloo", timeout=timedelta(seconds=60))
|
543 |
assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
|
544 |
assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
|
545 |
|
|
|
539 |
assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
|
540 |
torch.cuda.set_device(LOCAL_RANK)
|
541 |
device = torch.device('cuda', LOCAL_RANK)
|
542 |
+
dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
|
543 |
assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
|
544 |
assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
|
545 |
|