glenn-jocher
committed on
Commit
•
50a9828
1
Parent(s):
bb5ebc2
DDP `torch.jit.trace()` `--sync-bn` fix (#4615)
Browse files
* Remove assert
* debug0
* trace=not opt.sync
* sync to sync_bn fix
* Cleanup
- train.py +1 -2
- utils/loggers/__init__.py +5 -4
train.py
CHANGED
@@ -333,7 +333,7 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
333 |
mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
|
334 |
pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
|
335 |
f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
|
336 |
-
callbacks.on_train_batch_end(ni, model, imgs, targets, paths, plots)
|
337 |
# end batch ------------------------------------------------------------------------------------------------
|
338 |
|
339 |
# Scheduler
|
@@ -499,7 +499,6 @@ def main(opt):
|
|
499 |
assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
|
500 |
assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
|
501 |
assert not opt.evolve, '--evolve argument is not compatible with DDP training'
|
502 |
-
assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
|
503 |
torch.cuda.set_device(LOCAL_RANK)
|
504 |
device = torch.device('cuda', LOCAL_RANK)
|
505 |
dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
|
|
|
333 |
mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
|
334 |
pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
|
335 |
f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
|
336 |
+
callbacks.on_train_batch_end(ni, model, imgs, targets, paths, plots, opt.sync_bn)
|
337 |
# end batch ------------------------------------------------------------------------------------------------
|
338 |
|
339 |
# Scheduler
|
|
|
499 |
assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
|
500 |
assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
|
501 |
assert not opt.evolve, '--evolve argument is not compatible with DDP training'
|
|
|
502 |
torch.cuda.set_device(LOCAL_RANK)
|
503 |
device = torch.device('cuda', LOCAL_RANK)
|
504 |
dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
|
utils/loggers/__init__.py
CHANGED
@@ -69,13 +69,14 @@ class Loggers():
|
|
69 |
if self.wandb:
|
70 |
self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]})
|
71 |
|
72 |
-
def on_train_batch_end(self, ni, model, imgs, targets, paths, plots):
|
73 |
# Callback runs on train batch end
|
74 |
if plots:
|
75 |
if ni == 0:
|
76 |
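-
with warnings.catch_warnings():
|
77 |
-
warnings.simplefilter('ignore') # suppress jit trace warning
|
78 |
-
self.tb.add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), [])
|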
|
|
79 |
if ni < 3:
|
80 |
f = self.save_dir / f'train_batch{ni}.jpg' # filename
|
81 |
Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
|
|
|
69 |
if self.wandb:
|
70 |
self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]})
|
71 |
|
72 |
+
def on_train_batch_end(self, ni, model, imgs, targets, paths, plots, sync_bn):
|
73 |
# Callback runs on train batch end
|
74 |
if plots:
|
75 |
if ni == 0:
|
76 |
+
if not sync_bn: # tb.add_graph() --sync known issue https://github.com/ultralytics/yolov5/issues/3754
|
77 |
+
with warnings.catch_warnings():
|
78 |
+
warnings.simplefilter('ignore') # suppress jit trace warning
|
79 |
+
self.tb.add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), [])
|
80 |
if ni < 3:
|
81 |
f = self.save_dir / f'train_batch{ni}.jpg' # filename
|
82 |
Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
|