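Fix the wrong GPU index in multi-node training: select the CUDA device from the LOCAL_RANK environment variable instead of the global rank returned by dist.get_rank()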

Files changed (2) hide show

configs/metadata.json CHANGED Viewed

@@ -1,14 +1,15 @@
 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
-    "version": "1.0.4",
     "changelog": {
         "1.0.4": "update with new lr scheduler api",
         "1.0.3": "update required packages",
         "1.0.2": "remove unused saver in inference",
         "1.0.1": "fix inference folder error",
         "1.0.0": "Initial release"
     },
-    "monai_version": "1.2.0rc7",
     "pytorch_version": "1.13.1",
     "numpy_version": "1.22.2",
     "optional_packages_version": {

 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
+    "version": "1.0.5",
     "changelog": {
+        "1.0.5": "fix the wrong GPU index issue of multi-node",
         "1.0.4": "update with new lr scheduler api",
         "1.0.3": "update required packages",
         "1.0.2": "remove unused saver in inference",
         "1.0.1": "fix inference folder error",
         "1.0.0": "Initial release"
     },
+    "monai_version": "1.2.0",
     "pytorch_version": "1.13.1",
     "numpy_version": "1.22.2",
     "optional_packages_version": {

configs/multi_gpu_train_autoencoder.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-    "device": "$torch.device(f'cuda:{dist.get_rank()}')",
     "gnetwork": {
         "_target_": "torch.nn.parallel.DistributedDataParallel",
         "module": "$@autoencoder_def.to(@device)",
@@ -27,6 +27,7 @@
     "train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
     "initialize": [
         "$import torch.distributed as dist",
         "$dist.is_initialized() or dist.init_process_group(backend='nccl')",
         "$torch.cuda.set_device(@device)",
         "$monai.utils.set_determinism(seed=123)",

 {
+    "device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
     "gnetwork": {
         "_target_": "torch.nn.parallel.DistributedDataParallel",
         "module": "$@autoencoder_def.to(@device)",
     "train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
     "initialize": [
         "$import torch.distributed as dist",
+        "$import os",
         "$dist.is_initialized() or dist.init_process_group(backend='nccl')",
         "$torch.cuda.set_device(@device)",
         "$monai.utils.set_determinism(seed=123)",