glenn-jocher
committed on
Commit
•
379396e
1
Parent(s):
c4addd7
Yaml constructor posixpath --resume bug fix (#1390)
Browse files
* resume fix for yaml constructor posixpath error
* fix update
* remove weights/ dir backup
train.py
CHANGED
@@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
|
|
37 |
def train(hyp, opt, device, tb_writer=None, wandb=None):
|
38 |
logger.info(f'Hyperparameters {hyp}')
|
39 |
save_dir, epochs, batch_size, total_batch_size, weights, rank = \
|
40 |
-
opt.save_dir, opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank
|
41 |
|
42 |
# Directories
|
43 |
wdir = save_dir / 'weights'
|
@@ -143,7 +143,6 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
|
|
143 |
start_epoch = ckpt['epoch'] + 1
|
144 |
if opt.resume:
|
145 |
assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
|
146 |
-
shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}') # save previous weights
|
147 |
if epochs < start_epoch:
|
148 |
logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
|
149 |
(weights, ckpt['epoch'], epochs))
|
@@ -431,9 +430,8 @@ if __name__ == '__main__':
|
|
431 |
# Resume
|
432 |
if opt.resume: # resume an interrupted run
|
433 |
ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
|
434 |
-
opt.save_dir = Path(ckpt).parent.parent # runs/train/exp
|
435 |
assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
|
436 |
-
with open(opt.save_dir / 'opt.yaml') as f:
|
437 |
opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
|
438 |
opt.cfg, opt.weights, opt.resume = '', ckpt, True
|
439 |
logger.info('Resuming training from %s' % ckpt)
|
@@ -443,7 +441,7 @@ if __name__ == '__main__':
|
|
443 |
assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
|
444 |
opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
|
445 |
opt.name = 'evolve' if opt.evolve else opt.name
|
446 |
-
opt.save_dir =
|
447 |
|
448 |
# DDP mode
|
449 |
device = select_device(opt.device, batch_size=opt.batch_size)
|
@@ -517,7 +515,7 @@ if __name__ == '__main__':
|
|
517 |
assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
|
518 |
opt.notest, opt.nosave = True, True # only test/save final epoch
|
519 |
# ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices
|
520 |
-
yaml_file = opt.save_dir / 'hyp_evolved.yaml' # save best result here
|
521 |
if opt.bucket:
|
522 |
os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists
|
523 |
|
|
|
37 |
def train(hyp, opt, device, tb_writer=None, wandb=None):
|
38 |
logger.info(f'Hyperparameters {hyp}')
|
39 |
save_dir, epochs, batch_size, total_batch_size, weights, rank = \
|
40 |
+
Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank
|
41 |
|
42 |
# Directories
|
43 |
wdir = save_dir / 'weights'
|
|
|
143 |
start_epoch = ckpt['epoch'] + 1
|
144 |
if opt.resume:
|
145 |
assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
|
|
|
146 |
if epochs < start_epoch:
|
147 |
logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
|
148 |
(weights, ckpt['epoch'], epochs))
|
|
|
430 |
# Resume
|
431 |
if opt.resume: # resume an interrupted run
|
432 |
ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
|
|
|
433 |
assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
|
434 |
+
with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
|
435 |
opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace
|
436 |
opt.cfg, opt.weights, opt.resume = '', ckpt, True
|
437 |
logger.info('Resuming training from %s' % ckpt)
|
|
|
441 |
assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
|
442 |
opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
|
443 |
opt.name = 'evolve' if opt.evolve else opt.name
|
444 |
+
opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok) # increment run
|
445 |
|
446 |
# DDP mode
|
447 |
device = select_device(opt.device, batch_size=opt.batch_size)
|
|
|
515 |
assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
|
516 |
opt.notest, opt.nosave = True, True # only test/save final epoch
|
517 |
# ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices
|
518 |
+
yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here
|
519 |
if opt.bucket:
|
520 |
os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists
|
521 |
|