import os
import random

import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_runner,
                         get_dist_info)

from mmdet.core import DistEvalHook, EvalHook, build_optimizer
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.utils import (build_ddp, build_dp, compat_cfg,
                         find_latest_checkpoint, get_root_logger)

def init_random_seed(seed=None, device='cuda'):
    """Initialize random seed.

    If the seed is not set, it is drawn at random on rank 0 and then
    broadcast to all other processes, so that every process uses the
    same seed and potential synchronization bugs are avoided.

    Args:
        seed (int, optional): The seed. Defaults to None.
        device (str): The device where the seed will be put on.
            Defaults to 'cuda'.

    Returns:
        int: Seed to be used.
    """
    if seed is not None:
        return seed

    # In a single-process run there is nobody to synchronize with, so the
    # locally drawn seed can be returned directly.
    rank, world_size = get_dist_info()
    seed = np.random.randint(2**31)
    if world_size == 1:
        return seed

    # Rank 0 holds the authoritative seed; every other rank receives it via
    # broadcast so that all processes end up with the same value.
    if rank == 0:
        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
    else:
        random_num = torch.tensor(0, dtype=torch.int32, device=device)
    dist.broadcast(random_num, src=0)
    return random_num.item()

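# Illustrative sketch (comments only, nothing executed on import): in a
# distributed run every rank calls init_random_seed() and receives the
# identical value, because rank 0 draws the seed and broadcasts it:
#
#     seed = init_random_seed(None, device='cuda')  # same value on all ranks
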
def set_random_seed(seed, deterministic=False):
    """Set random seed.

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            the CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
            to True and `torch.backends.cudnn.benchmark` to False.
            Defaults to False.
    """
    # Seed every RNG the training pipeline may touch: Python, NumPy and
    # PyTorch (CPU as well as all visible GPUs).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

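# Usage sketch (illustrative; `args` and `cfg` are assumed to come from the
# caller's argument parsing and config loading, they are not defined here):
#
#     seed = init_random_seed(args.seed, device=cfg.device)
#     set_random_seed(seed, deterministic=args.deterministic)
#     cfg.seed = seed
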
def auto_scale_lr(cfg, distributed, logger):
    """Automatically scale the LR according to the number of GPUs and the
    number of samples per GPU.

    Args:
        cfg (config): Training config.
        distributed (bool): Whether distributed training is used.
        logger (logging.Logger): Logger.
    """
    # Scaling is opt-in: it requires cfg.auto_scale_lr with enable=True.
    if ('auto_scale_lr' not in cfg) or \
            (not cfg.auto_scale_lr.get('enable', False)):
        logger.info('Automatic scaling of learning rate (LR)'
                    ' has been disabled.')
        return

    # Without a reference batch size there is nothing to scale against.
    base_batch_size = cfg.auto_scale_lr.get('base_batch_size', None)
    if base_batch_size is None:
        return

    if distributed:
        _, world_size = get_dist_info()
        num_gpus = world_size
    else:
        num_gpus = len(cfg.gpu_ids)

    samples_per_gpu = cfg.data.train_dataloader.samples_per_gpu
    batch_size = num_gpus * samples_per_gpu
    logger.info(f'Training with {num_gpus} GPU(s) with {samples_per_gpu} '
                f'samples per GPU. The total batch size is {batch_size}.')

    if batch_size != base_batch_size:
        # Scale the LR linearly with the ratio of the actual batch size to
        # the base batch size (the linear scaling rule).
        scaled_lr = (batch_size / base_batch_size) * cfg.optimizer.lr
        logger.info('LR has been automatically scaled '
                    f'from {cfg.optimizer.lr} to {scaled_lr}')
        cfg.optimizer.lr = scaled_lr
    else:
        logger.info('The batch size matches the '
                    f'base batch size: {base_batch_size}, '
                    f'will not scale the LR ({cfg.optimizer.lr}).')

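# Illustrative config snippet (field names follow the convention this
# function reads; treat it as an assumption, not the one true config): with
# `auto_scale_lr = dict(enable=True, base_batch_size=16)` and
# `optimizer = dict(type='SGD', lr=0.02)`, a run on 4 GPUs with 2 samples
# per GPU has batch size 8, so the LR is scaled to (8 / 16) * 0.02 = 0.01.
# On 8 GPUs with 2 samples per GPU (batch size 16) the LR is left unchanged.
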
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    """Train a detector: build dataloaders, wrap the model, construct the
    runner with its hooks, and launch the training workflow."""
    cfg = compat_cfg(cfg)
    logger = get_root_logger(log_level=cfg.log_level)

    # Accept a single dataset as well as a list of datasets (one per
    # workflow stage).
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[
        'type']

    train_dataloader_default_args = dict(
        samples_per_gpu=2,
        workers_per_gpu=2,
        # cfg.gpu_ids is expected to be set by the caller.
        num_gpus=len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed,
        runner_type=runner_type,
        persistent_workers=False)

    # Values from cfg.data.train_dataloader override the defaults above.
    train_loader_cfg = {
        **train_dataloader_default_args,
        **cfg.data.get('train_dataloader', {})
    }

    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

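    # For example (illustrative), a config containing
    # `data = dict(train_dataloader=dict(samples_per_gpu=4))` would override
    # the default of 2 and train with a per-GPU batch size of 4.
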
    # Put the model on GPUs: DistributedDataParallel for multi-process
    # training, DataParallel otherwise.
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Set find_unused_parameters = True in the config to tolerate
        # parameters that receive no gradient in the forward pass.
        model = build_ddp(
            model,
            cfg.device,
            device_ids=[int(os.environ['LOCAL_RANK'])],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)

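    # Note (assumption about the launch environment): LOCAL_RANK is expected
    # to be set per process by the distributed launcher, e.g.
    #     torchrun --nproc_per_node=8 tools/train.py <config> --launcher pytorch
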
    # The LR in cfg.optimizer must be rescaled before the optimizer is built.
    auto_scale_lr(cfg, distributed, logger)
    optimizer = build_optimizer(model, cfg.optimizer)

    runner = build_runner(
        cfg.runner,
        default_args=dict(
            model=model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta))

    # Reuse the caller's timestamp so the log file names stay consistent.
    runner.timestamp = timestamp

    # fp16 setting: NPU devices default to dynamic loss scaling.
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is None and cfg.get('device', None) == 'npu':
        fp16_cfg = dict(loss_scale='dynamic')
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

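    # Illustrative fp16 config (the field name follows the mmcv convention
    # this block reads): `fp16 = dict(loss_scale=512.)` enables
    # mixed-precision training with a fixed loss scale of 512.
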
    # Register the standard training hooks (LR schedule, optimizer step,
    # checkpointing, logging, optional momentum schedule and custom hooks).
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get('momentum_config', None),
        custom_hooks_config=cfg.get('custom_hooks', None))

    if distributed:
        # Re-seed the sampler each epoch so shuffling differs across epochs.
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

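    # Illustrative hook configs (shown as assumptions in the usual mmdet
    # style, not read literally by this function):
    #     lr_config = dict(policy='step', step=[8, 11])
    #     custom_hooks = [dict(type='NumClassCheckHook')]
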
    if validate:
        val_dataloader_default_args = dict(
            samples_per_gpu=1,
            workers_per_gpu=2,
            dist=distributed,
            shuffle=False,
            persistent_workers=False)

        val_dataloader_args = {
            **val_dataloader_default_args,
            **cfg.data.get('val_dataloader', {})
        }

        if val_dataloader_args['samples_per_gpu'] > 1:
            # Batched evaluation needs tensors of equal shape, so replace
            # 'ImageToTensor' with 'DefaultFormatBundle' in the pipeline.
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))

        val_dataloader = build_dataloader(val_dataset, **val_dataloader_args)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_hook = DistEvalHook if distributed else EvalHook

        # Hooks with lower priority run later within each stage; 'LOW' keeps
        # evaluation after the default-priority training hooks.
        runner.register_hook(
            eval_hook(val_dataloader, **eval_cfg), priority='LOW')

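    # Illustrative evaluation config (mmdet convention, shown as an
    # assumption): `evaluation = dict(interval=1, metric='bbox')` evaluates
    # COCO-style bbox mAP after every epoch.
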
    # Resume takes precedence: auto_resume picks up the latest checkpoint in
    # work_dir, an explicit resume_from restores weights plus runner state,
    # and load_from initializes weights only.
    resume_from = None
    if cfg.resume_from is None and cfg.get('auto_resume'):
        resume_from = find_latest_checkpoint(cfg.work_dir)
    if resume_from is not None:
        cfg.resume_from = resume_from

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
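
# End-to-end usage sketch (illustrative; `Config`, `build_detector` and the
# remaining cfg fields such as gpu_ids, seed and device are assumed to be
# set up by a tools/train.py-style entry script, not by this module):
#
#     from mmcv import Config
#     from mmdet.models import build_detector
#
#     cfg = Config.fromfile('configs/my_config.py')
#     cfg.work_dir = './work_dirs/my_config'
#     model = build_detector(cfg.model)
#     datasets = [build_dataset(cfg.data.train)]
#     train_detector(model, datasets, cfg, distributed=False, validate=True)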