fix the wrong GPU index issue of multi-node
Browse files
configs/metadata.json
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
{
|
2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
|
3 |
-
"version": "1.0.4",
|
4 |
"changelog": {
|
|
|
5 |
"1.0.4": "update with new lr scheduler api",
|
6 |
"1.0.3": "update required packages",
|
7 |
"1.0.2": "remove unused saver in inference",
|
8 |
"1.0.1": "fix inference folder error",
|
9 |
"1.0.0": "Initial release"
|
10 |
},
|
11 |
-
"monai_version": "1.2.
|
12 |
"pytorch_version": "1.13.1",
|
13 |
"numpy_version": "1.22.2",
|
14 |
"optional_packages_version": {
|
|
|
1 |
{
|
2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
|
3 |
+
"version": "1.0.5",
|
4 |
"changelog": {
|
5 |
+
"1.0.5": "fix the wrong GPU index issue of multi-node",
|
6 |
"1.0.4": "update with new lr scheduler api",
|
7 |
"1.0.3": "update required packages",
|
8 |
"1.0.2": "remove unused saver in inference",
|
9 |
"1.0.1": "fix inference folder error",
|
10 |
"1.0.0": "Initial release"
|
11 |
},
|
12 |
+
"monai_version": "1.2.0",
|
13 |
"pytorch_version": "1.13.1",
|
14 |
"numpy_version": "1.22.2",
|
15 |
"optional_packages_version": {
|
configs/multi_gpu_train_autoencoder.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"device": "$torch.device(f'cuda:{dist.get_rank()}')",
|
3 |
"gnetwork": {
|
4 |
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
5 |
"module": "$@autoencoder_def.to(@device)",
|
@@ -27,6 +27,7 @@
|
|
27 |
"train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
|
28 |
"initialize": [
|
29 |
"$import torch.distributed as dist",
|
|
|
30 |
"$dist.is_initialized() or dist.init_process_group(backend='nccl')",
|
31 |
"$torch.cuda.set_device(@device)",
|
32 |
"$monai.utils.set_determinism(seed=123)",
|
|
|
1 |
{
|
2 |
+
"device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
|
3 |
"gnetwork": {
|
4 |
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
5 |
"module": "$@autoencoder_def.to(@device)",
|
|
|
27 |
"train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
|
28 |
"initialize": [
|
29 |
"$import torch.distributed as dist",
|
30 |
+
"$import os",
|
31 |
"$dist.is_initialized() or dist.init_process_group(backend='nccl')",
|
32 |
"$torch.cuda.set_device(@device)",
|
33 |
"$monai.utils.set_determinism(seed=123)",
|