# Copyright (c) OpenMMLab. All rights reserved.
import os
import random

import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_runner,
                         get_dist_info)

from mmdet.core import DistEvalHook, EvalHook, build_optimizer
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.utils import (build_ddp, build_dp, compat_cfg,
                         find_latest_checkpoint, get_root_logger)


def init_random_seed(seed=None, device='cuda'):
    """Initialize random seed.

    If the seed is not set, the seed will be automatically randomized,
    and then broadcast to all processes to prevent some potential bugs.

    Args:
        seed (int, Optional): The seed. Default to None.
        device (str): The device where the seed will be put on.
            Default to 'cuda'.

    Returns:
        int: Seed to be used.
    """
    if seed is not None:
        return seed

    # Make sure all ranks share the same random seed to prevent
    # some potential bugs. Please refer to
    # https://github.com/open-mmlab/mmdetection/issues/6339
    rank, world_size = get_dist_info()
    seed = np.random.randint(2**31)
    if world_size == 1:
        return seed

    if rank == 0:
        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
    else:
        random_num = torch.tensor(0, dtype=torch.int32, device=device)
    dist.broadcast(random_num, src=0)
    return random_num.item()


def set_random_seed(seed, deterministic=False):
    """Set random seed.

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
            to True and `torch.backends.cudnn.benchmark` to False.
            Default: False.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
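
# ---------------------------------------------------------------------------
# Usage sketch (comments only, not executed on import): how the two seed
# helpers above are typically wired together in a training entry point such
# as tools/train.py. The `args.*` option names are illustrative assumptions
# and may differ from the actual script.
#
#     seed = init_random_seed(args.seed, device=cfg.device)
#     # optionally give every rank a different seed
#     seed = seed + dist.get_rank() if args.diff_seed else seed
#     set_random_seed(seed, deterministic=args.deterministic)
#     cfg.seed = seed
#     meta['seed'] = seed
# ---------------------------------------------------------------------------
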

def auto_scale_lr(cfg, distributed, logger):
    """Automatically scale LR according to GPU number and samples per GPU.

    Args:
        cfg (config): Training config.
        distributed (bool): Using distributed or not.
        logger (logging.Logger): Logger.
    """
    # Get flag from config
    if ('auto_scale_lr' not in cfg) or \
            (not cfg.auto_scale_lr.get('enable', False)):
        logger.info('Automatic scaling of learning rate (LR)'
                    ' has been disabled.')
        return

    # Get base batch size from config
    base_batch_size = cfg.auto_scale_lr.get('base_batch_size', None)
    if base_batch_size is None:
        return

    # Get gpu number
    if distributed:
        _, world_size = get_dist_info()
        num_gpus = world_size
    else:
        num_gpus = len(cfg.gpu_ids)

    # calculate the batch size
    samples_per_gpu = cfg.data.train_dataloader.samples_per_gpu
    batch_size = num_gpus * samples_per_gpu
    logger.info(f'Training with {num_gpus} GPU(s) with {samples_per_gpu} '
                f'samples per GPU. The total batch size is {batch_size}.')

    if batch_size != base_batch_size:
        # scale LR with
        # [linear scaling rule](https://arxiv.org/abs/1706.02677)
        scaled_lr = (batch_size / base_batch_size) * cfg.optimizer.lr
        logger.info('LR has been automatically scaled '
                    f'from {cfg.optimizer.lr} to {scaled_lr}')
        cfg.optimizer.lr = scaled_lr
    else:
        logger.info('The batch size matches the '
                    f'base batch size: {base_batch_size}, '
                    f'will not scale the LR ({cfg.optimizer.lr}).')
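
# ---------------------------------------------------------------------------
# Example of the config fields read by auto_scale_lr() above (illustrative
# values; `enable` and `base_batch_size` usually come from a base config such
# as configs/_base_/default_runtime.py):
#
#     auto_scale_lr = dict(enable=True, base_batch_size=16)
#     data = dict(samples_per_gpu=2, workers_per_gpu=2, ...)
#     optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
#
# With 8 GPUs x 2 samples per GPU the total batch size is 16, which equals
# base_batch_size, so the LR stays at 0.02; with 4 GPUs the batch size is 8
# and the linear scaling rule would reduce the LR to 0.02 * 8 / 16 = 0.01.
# ---------------------------------------------------------------------------
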

def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):

    cfg = compat_cfg(cfg)
    logger = get_root_logger(log_level=cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[
        'type']

    train_dataloader_default_args = dict(
        samples_per_gpu=2,
        workers_per_gpu=2,
        # `num_gpus` will be ignored if distributed
        num_gpus=len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed,
        runner_type=runner_type,
        persistent_workers=False)

    train_loader_cfg = {
        **train_dataloader_default_args,
        **cfg.data.get('train_dataloader', {})
    }

    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = build_ddp(
            model,
            cfg.device,
            device_ids=[int(os.environ['LOCAL_RANK'])],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)

    # build optimizer
    auto_scale_lr(cfg, distributed, logger)
    optimizer = build_optimizer(model, cfg.optimizer)

    runner = build_runner(
        cfg.runner,
        default_args=dict(
            model=model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is None and cfg.get('device', None) == 'npu':
        fp16_cfg = dict(loss_scale='dynamic')
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get('momentum_config', None),
        custom_hooks_config=cfg.get('custom_hooks', None))

    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataloader_default_args = dict(
            samples_per_gpu=1,
            workers_per_gpu=2,
            dist=distributed,
            shuffle=False,
            persistent_workers=False)

        val_dataloader_args = {
            **val_dataloader_default_args,
            **cfg.data.get('val_dataloader', {})
        }
        # Support batch_size > 1 in validation
        if val_dataloader_args['samples_per_gpu'] > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))

        val_dataloader = build_dataloader(val_dataset, **val_dataloader_args)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook
        # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the
        # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'.
        runner.register_hook(
            eval_hook(val_dataloader, **eval_cfg), priority='LOW')

    resume_from = None
    if cfg.resume_from is None and cfg.get('auto_resume'):
        resume_from = find_latest_checkpoint(cfg.work_dir)
    if resume_from is not None:
        cfg.resume_from = resume_from

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
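
# ---------------------------------------------------------------------------
# Minimal usage sketch for train_detector() (comments only; the real entry
# point is tools/train.py, which also sets up work_dir, logging, checkpoint
# meta and distributed launch). The config path and work_dir below are
# illustrative assumptions.
#
#     from mmcv import Config
#     from mmdet.datasets import build_dataset
#     from mmdet.models import build_detector
#
#     cfg = Config.fromfile(
#         'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py')
#     cfg = compat_cfg(cfg)
#     cfg.work_dir = './work_dirs/example'
#     cfg.device = 'cuda'
#     cfg.gpu_ids = [0]
#     cfg.seed = init_random_seed(0)
#     set_random_seed(cfg.seed)
#
#     model = build_detector(cfg.model)
#     datasets = [build_dataset(cfg.data.train)]
#     train_detector(model, datasets, cfg, distributed=False, validate=True)
# ---------------------------------------------------------------------------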