Training Code: cls/det

- training/.gitignore +250 -0
- training/Detection/README.md +47 -0
- training/Detection/configs/_base_/models/cascade_mask_rcnn_revcol_fpn.py +209 -0
- training/Detection/configs/revcol/cascade_mask_rcnn_revcol_base_3x_in1k.py +152 -0
- training/Detection/configs/revcol/cascade_mask_rcnn_revcol_base_3x_in22k.py +152 -0
- training/Detection/configs/revcol/cascade_mask_rcnn_revcol_large_3x_in22k.py +152 -0
- training/Detection/configs/revcol/cascade_mask_rcnn_revcol_small_3x_in1k.py +152 -0
- training/Detection/configs/revcol/cascade_mask_rcnn_revcol_tiny_3x_in1k.py +152 -0
- training/Detection/mmcv_custom/__init__.py +15 -0
- training/Detection/mmcv_custom/checkpoint.py +484 -0
- training/Detection/mmcv_custom/customized_text.py +130 -0
- training/Detection/mmcv_custom/layer_decay_optimizer_constructor.py +121 -0
- training/Detection/mmcv_custom/runner/checkpoint.py +85 -0
- training/Detection/mmdet/models/backbones/__init__.py +28 -0
- training/Detection/mmdet/models/backbones/revcol.py +187 -0
- training/Detection/mmdet/models/backbones/revcol_function.py +222 -0
- training/Detection/mmdet/models/backbones/revcol_module.py +85 -0
- training/Detection/mmdet/utils/__init__.py +12 -0
- training/Detection/mmdet/utils/optimizer.py +33 -0
- training/INSTRUCTIONS.md +158 -0
- training/LICENSE +190 -0
- training/README.md +79 -0
- training/config.py +243 -0
- training/configs/revcol_base_1k.yaml +48 -0
- training/configs/revcol_base_1k_224_finetune.yaml +50 -0
- training/configs/revcol_base_1k_384_finetune.yaml +50 -0
- training/configs/revcol_base_22k_pretrain.yaml +51 -0
- training/configs/revcol_large_1k_224_finetune.yaml +51 -0
- training/configs/revcol_large_1k_384_finetune.yaml +51 -0
- training/configs/revcol_large_22k_pretrain.yaml +50 -0
- training/configs/revcol_small_1k.yaml +48 -0
- training/configs/revcol_tiny_1k.yaml +48 -0
- training/configs/revcol_xlarge_1k_384_finetune.yaml +53 -0
- training/configs/revcol_xlarge_22k_pretrain.yaml +50 -0
- training/data/__init__.py +1 -0
- training/data/build_data.py +137 -0
- training/data/samplers.py +29 -0
- training/figures/title.png +0 -0
- training/logger.py +41 -0
- training/loss.py +35 -0
- training/lr_scheduler.py +96 -0
- training/main.py +422 -0
- training/models/__init__.py +1 -0
- training/models/build.py +48 -0
- training/models/modules.py +157 -0
- training/models/revcol.py +242 -0
- training/models/revcol_function.py +159 -0
- training/optimizer.py +145 -0
- training/requirements.txt +7 -0
- training/utils.py +179 -0
training/.gitignore
ADDED
@@ -0,0 +1,250 @@
```
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/en/_build/
docs/zh_cn/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

data/
data
.vscode
.idea
.DS_Store

# custom
*.pkl
*.pkl.json
*.log.json
docs/modelzoo_statistics.md
mmdet/.mim
work_dirs/

# Pytorch
*.pth
*.py~
*.sh~
.DS_
```
training/Detection/README.md
ADDED
@@ -0,0 +1,47 @@
# COCO Object detection with RevCol

## Getting started

We build the RevCol object detection model on top of [mmdetection](https://github.com/open-mmlab/mmdetection/tree/3e2693151add9b5d6db99b944da020cba837266b) commit `3e26931`, adding the RevCol model and config files to [the original repo](https://github.com/open-mmlab/mmdetection/tree/3e2693151add9b5d6db99b944da020cba837266b). Please refer to [get_started.md](https://github.com/open-mmlab/mmdetection/blob/3e2693151add9b5d6db99b944da020cba837266b/docs/en/get_started.md) for installation and dataset preparation instructions.

## Results and Fine-tuned Models

| name | Pretrained Model | Method | Lr Schd | box mAP | mask mAP | #params | FLOPs | Fine-tuned Model |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| RevCol-T | [ImageNet-1K]() | Cascade Mask R-CNN | 3x | 50.6 | 43.8 | 88M | 741G | [model]() |
| RevCol-S | [ImageNet-1K]() | Cascade Mask R-CNN | 3x | 52.6 | 45.5 | 118M | 833G | [model]() |
| RevCol-B | [ImageNet-1K]() | Cascade Mask R-CNN | 3x | 53.0 | 45.9 | 196M | 988G | [model]() |
| RevCol-B | [ImageNet-22K]() | Cascade Mask R-CNN | 3x | 55.0 | 47.5 | 196M | 988G | [model]() |
| RevCol-L | [ImageNet-22K]() | Cascade Mask R-CNN | 3x | 55.9 | 48.4 | 330M | 1453G | [model]() |

## Training

To train a detector with pre-trained models, run:
```
# single-gpu training
python tools/train.py <CONFIG_FILE> --cfg-options model.pretrained=<PRETRAIN_MODEL> [other optional arguments]

# multi-gpu training
tools/dist_train.sh <CONFIG_FILE> <GPU_NUM> --cfg-options model.pretrained=<PRETRAIN_MODEL> [other optional arguments]
```
For example, to train a Cascade Mask R-CNN model with a `RevCol-T` backbone on 8 GPUs, run:
```
tools/dist_train.sh configs/revcol/cascade_mask_rcnn_revcol_tiny_3x_in1k.py 8 --cfg-options pretrained=<PRETRAIN_MODEL>
```

More config files can be found at [`configs/revcol`](configs/revcol).

## Inference

```
# single-gpu testing
python tools/test.py <CONFIG_FILE> <DET_CHECKPOINT_FILE> --eval bbox segm

# multi-gpu testing
tools/dist_test.sh <CONFIG_FILE> <DET_CHECKPOINT_FILE> <GPU_NUM> --eval bbox segm
```

## Acknowledgment

This code is built on the [mmdetection](https://github.com/open-mmlab/mmdetection) and [timm](https://github.com/rwightman/pytorch-image-models) libraries and the [BeiT](https://github.com/microsoft/unilm/tree/f8f3df80c65eb5e5fc6d6d3c9bd3137621795d1e/beit) and [Swin Transformer](https://github.com/microsoft/Swin-Transformer) repositories.
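For orientation, the backbone the configs below reference as `type='RevCol'` is registered with mmdetection's `BACKBONES` registry by the files this commit adds under `training/Detection/mmdet/models/backbones/`. The following is a minimal sketch of that integration point only; the class body is a hypothetical stand-in (simple strided convolutions), not the repo's actual `revcol.py`, and the constructor arguments simply mirror the config keys used in `configs/revcol/*.py`:

```python
# Hypothetical sketch of how a custom backbone plugs into mmdetection 2.x;
# the real reversible-column implementation lives in revcol.py.
import torch.nn as nn
from mmcv.runner import BaseModule
from mmdet.models.builder import BACKBONES


@BACKBONES.register_module()
class RevColSketch(BaseModule):  # the real registered name is 'RevCol'
    def __init__(self, channels, layers, num_subnet, drop_path,
                 save_memory, out_indices, init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        # layers/num_subnet/drop_path/save_memory are accepted for config
        # compatibility; this sketch does not use them.
        self.out_indices = out_indices
        # overall strides 4/8/16/32, one feature map per pyramid level
        self.stem = nn.Conv2d(3, channels[0], kernel_size=4, stride=4)
        self.stages = nn.ModuleList(
            nn.Conv2d(channels[i], channels[i + 1], kernel_size=2, stride=2)
            for i in range(len(channels) - 1))

    def forward(self, x):
        feats = [self.stem(x)]
        for stage in self.stages:
            feats.append(stage(feats[-1]))
        # the FPN neck consumes a tuple of maps, one per out_indices entry,
        # whose widths must match the neck's in_channels
        return tuple(feats[i] for i in self.out_indices)
```

The key contract is the last line: `forward` returns one feature map per entry in `out_indices`, with channel widths matching the `in_channels` declared on the FPN neck in the configs.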
training/Detection/configs/_base_/models/cascade_mask_rcnn_revcol_fpn.py
ADDED
@@ -0,0 +1,209 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# model settings
pretrained = None
model = dict(
    type='CascadeRCNN',
    backbone=dict(
        type='RevCol',
        channels=[48, 96, 192, 384],
        layers=[3, 3, 9, 3],
        num_subnet=4,
        drop_path=0.2,
        save_memory=False,
        out_indices=[0, 1, 2, 3],
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='FPN',
        in_channels=[128, 256, 512, 1024],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ],
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_across_levels=False,
            nms_pre=2000,
            nms_post=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg=dict(
        rpn=dict(
            nms_across_levels=False,
            nms_pre=1000,
            nms_post=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
```
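The variant configs that follow inherit this file through mmcv's `_base_` mechanism and override only the keys that differ (backbone widths, neck `in_channels`, drop path, optimizer settings). A quick way to inspect what a merged config resolves to, assuming you run from `training/Detection/`:

```python
# Inspect a merged mmcv config: the child file overrides base keys in place.
from mmcv import Config

cfg = Config.fromfile('configs/revcol/cascade_mask_rcnn_revcol_tiny_3x_in1k.py')
print(cfg.model.backbone.channels)  # [64, 128, 256, 512], set by the child
print(cfg.model.rpn_head.type)      # 'RPNHead', inherited from this base file
```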
training/Detection/configs/revcol/cascade_mask_rcnn_revcol_base_3x_in1k.py
ADDED
@@ -0,0 +1,152 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

_base_ = [
    '../_base_/models/cascade_mask_rcnn_revcol_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]
pretrained = './cls_model/revcol_base_1k.pth'
model = dict(
    backbone=dict(
        channels=[72, 144, 288, 576],
        layers=[1, 1, 3, 2],
        num_subnet=16,
        drop_path=0.4,
        save_memory=False,
        out_indices=[0, 1, 2, 3],
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(in_channels=[72, 144, 288, 576]),
    roi_head=dict(
        bbox_head=[
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0))
        ]))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# augmentation strategy originates from DETR / Sparse RCNN
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='AutoAugment',
         policies=[
             [
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True)
             ],
             [
                 dict(type='Resize',
                      img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True),
                 dict(type='RandomCrop',
                      crop_type='absolute_range',
                      crop_size=(384, 600),
                      allow_negative_crop=True),
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      override=True,
                      keep_ratio=True)
             ]
         ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
data = dict(train=dict(pipeline=train_pipeline))

optimizer = dict(constructor='LearningRateDecayOptimizerConstructor',
                 _delete_=True, type='AdamW',
                 lr=0.0002, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg={'decay_rate': 0.9,
                                'decay_type': 'layer_wise',
                                'layers': [1, 1, 3, 2],
                                'num_subnet': 16})

lr_config = dict(step=[27, 33])
runner = dict(type='EpochBasedRunner', max_epochs=36)

# fp16 = None
# optimizer_config = dict(
#     type="DistOptimizerHook",
#     update_interval=1,
#     grad_clip=None,
#     coalesce=True,
#     bucket_size_mb=-1,
#     use_fp16=True,
# )
fp16 = dict(loss_scale='dynamic')
```
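The `paramwise_cfg` above drives the custom `LearningRateDecayOptimizerConstructor` added in `mmcv_custom/layer_decay_optimizer_constructor.py`. The usual layer-wise scheme scales each depth bucket's learning rate geometrically toward the input; here is a minimal sketch under the assumption that the constructor follows the common BeiT/ConvNeXt recipe (the exact mapping of RevCol sub-networks to buckets lives in the constructor itself and may differ):

```python
# Hypothetical illustration of layer-wise lr decay with the values above.
def lr_scale(layer_id: int, num_layers: int, decay_rate: float) -> float:
    # deeper buckets (larger layer_id) keep a larger fraction of the base lr
    return decay_rate ** (num_layers - 1 - layer_id)

base_lr, decay_rate = 0.0002, 0.9
num_layers = sum([1, 1, 3, 2]) + 2  # assumption: block buckets plus stem/head
for layer_id in range(num_layers):
    scaled = base_lr * lr_scale(layer_id, num_layers, decay_rate)
    print(f'layer {layer_id}: lr = {scaled:.2e}')
```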
training/Detection/configs/revcol/cascade_mask_rcnn_revcol_base_3x_in22k.py
ADDED
@@ -0,0 +1,152 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

_base_ = [
    '../_base_/models/cascade_mask_rcnn_revcol_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]
pretrained = './cls_model/revcol_base_22k.pth'
model = dict(
    backbone=dict(
        channels=[72, 144, 288, 576],
        layers=[1, 1, 3, 2],
        num_subnet=16,
        drop_path=0.4,
        save_memory=False,
        out_indices=[0, 1, 2, 3],
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(in_channels=[72, 144, 288, 576]),
    roi_head=dict(
        bbox_head=[
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0))
        ]))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# augmentation strategy originates from DETR / Sparse RCNN
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='AutoAugment',
         policies=[
             [
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True)
             ],
             [
                 dict(type='Resize',
                      img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True),
                 dict(type='RandomCrop',
                      crop_type='absolute_range',
                      crop_size=(384, 600),
                      allow_negative_crop=True),
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      override=True,
                      keep_ratio=True)
             ]
         ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
data = dict(train=dict(pipeline=train_pipeline))

optimizer = dict(constructor='LearningRateDecayOptimizerConstructor',
                 _delete_=True, type='AdamW',
                 lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg={'decay_rate': 0.9,
                                'decay_type': 'layer_wise',
                                'layers': [1, 1, 3, 2],
                                'num_subnet': 16})

lr_config = dict(step=[27, 33])
runner = dict(type='EpochBasedRunner', max_epochs=36)

# fp16 = None
# optimizer_config = dict(
#     type="DistOptimizerHook",
#     update_interval=1,
#     grad_clip=None,
#     coalesce=True,
#     bucket_size_mb=-1,
#     use_fp16=True,
# )
fp16 = dict(loss_scale='dynamic')
```
training/Detection/configs/revcol/cascade_mask_rcnn_revcol_large_3x_in22k.py
ADDED
@@ -0,0 +1,152 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

_base_ = [
    '../_base_/models/cascade_mask_rcnn_revcol_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]
pretrained = './cls_model/revcol_large_22k.pth'
model = dict(
    backbone=dict(
        channels=[128, 256, 512, 1024],
        layers=[1, 2, 6, 2],
        num_subnet=8,
        drop_path=0.5,
        save_memory=False,
        out_indices=[0, 1, 2, 3],
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(in_channels=[128, 256, 512, 1024]),
    roi_head=dict(
        bbox_head=[
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0))
        ]))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# augmentation strategy originates from DETR / Sparse RCNN
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='AutoAugment',
         policies=[
             [
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True)
             ],
             [
                 dict(type='Resize',
                      img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True),
                 dict(type='RandomCrop',
                      crop_type='absolute_range',
                      crop_size=(384, 600),
                      allow_negative_crop=True),
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      override=True,
                      keep_ratio=True)
             ]
         ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
data = dict(train=dict(pipeline=train_pipeline))

optimizer = dict(constructor='LearningRateDecayOptimizerConstructor',
                 _delete_=True, type='AdamW',
                 lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg={'decay_rate': 0.85,
                                'decay_type': 'layer_wise',
                                'layers': [1, 2, 6, 2],
                                'num_subnet': 8})

lr_config = dict(step=[27, 33])
runner = dict(type='EpochBasedRunner', max_epochs=36)

# fp16 = None
# optimizer_config = dict(
#     type="DistOptimizerHook",
#     update_interval=1,
#     grad_clip=None,
#     coalesce=True,
#     bucket_size_mb=-1,
#     use_fp16=True,
# )
fp16 = dict(loss_scale='dynamic')
```
training/Detection/configs/revcol/cascade_mask_rcnn_revcol_small_3x_in1k.py
ADDED
@@ -0,0 +1,152 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

_base_ = [
    '../_base_/models/cascade_mask_rcnn_revcol_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]
pretrained = './cls_model/revcol_small_1k.pth'
model = dict(
    backbone=dict(
        channels=[64, 128, 256, 512],
        layers=[2, 2, 4, 2],
        num_subnet=8,
        drop_path=0.4,
        save_memory=False,
        out_indices=[0, 1, 2, 3],
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(in_channels=[64, 128, 256, 512]),
    roi_head=dict(
        bbox_head=[
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0))
        ]))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# augmentation strategy originates from DETR / Sparse RCNN
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='AutoAugment',
         policies=[
             [
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True)
             ],
             [
                 dict(type='Resize',
                      img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True),
                 dict(type='RandomCrop',
                      crop_type='absolute_range',
                      crop_size=(384, 600),
                      allow_negative_crop=True),
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      override=True,
                      keep_ratio=True)
             ]
         ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
data = dict(train=dict(pipeline=train_pipeline))

optimizer = dict(constructor='LearningRateDecayOptimizerConstructor',
                 _delete_=True, type='AdamW',
                 lr=0.0002, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg={'decay_rate': 0.85,
                                'decay_type': 'layer_wise',
                                'layers': [2, 2, 4, 2],
                                'num_subnet': 8})

lr_config = dict(step=[27, 33])
runner = dict(type='EpochBasedRunner', max_epochs=36)

# fp16 = None
# optimizer_config = dict(
#     type="DistOptimizerHook",
#     update_interval=1,
#     grad_clip=None,
#     coalesce=True,
#     bucket_size_mb=-1,
#     use_fp16=True,
# )
fp16 = dict(loss_scale='dynamic')
```
training/Detection/configs/revcol/cascade_mask_rcnn_revcol_tiny_3x_in1k.py
ADDED
@@ -0,0 +1,152 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

_base_ = [
    '../_base_/models/cascade_mask_rcnn_revcol_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]
pretrained = './cls_model/revcol_tiny_1k.pth'
model = dict(
    backbone=dict(
        channels=[64, 128, 256, 512],
        layers=[2, 2, 4, 2],
        num_subnet=4,
        drop_path=0.3,
        save_memory=False,
        out_indices=[0, 1, 2, 3],
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(in_channels=[64, 128, 256, 512]),
    roi_head=dict(
        bbox_head=[
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
            dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0))
        ]))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# augmentation strategy originates from DETR / Sparse RCNN
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='AutoAugment',
         policies=[
             [
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True)
             ],
             [
                 dict(type='Resize',
                      img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True),
                 dict(type='RandomCrop',
                      crop_type='absolute_range',
                      crop_size=(384, 600),
                      allow_negative_crop=True),
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      override=True,
                      keep_ratio=True)
             ]
         ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
data = dict(train=dict(pipeline=train_pipeline))

optimizer = dict(constructor='LearningRateDecayOptimizerConstructor',
                 _delete_=True, type='AdamW',
                 lr=0.0002, betas=(0.9, 0.999), weight_decay=0.05,
                 paramwise_cfg={'decay_rate': 0.85,
                                'decay_type': 'layer_wise',
                                'layers': [2, 2, 4, 2],
                                'num_subnet': 4})

lr_config = dict(step=[27, 33])
runner = dict(type='EpochBasedRunner', max_epochs=36)

# fp16 = None
# optimizer_config = dict(
#     type="DistOptimizerHook",
#     update_interval=1,
#     grad_clip=None,
#     coalesce=True,
#     bucket_size_mb=-1,
#     use_fp16=True,
# )
fp16 = dict(loss_scale='dynamic')
```
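All five configs end with `fp16 = dict(loss_scale='dynamic')`, which enables mmcv's mixed-precision optimizer hook with a dynamic loss scaler. Conceptually this matches PyTorch's `GradScaler`; the sketch below shows the mechanism as an analogy, not the hook's actual code:

```python
# Dynamic loss scaling in one training step: scale the loss so fp16 gradients
# do not underflow, then unscale before the optimizer step and adapt the scale.
import torch

scaler = torch.cuda.amp.GradScaler()  # grows/shrinks the scale automatically

def train_step(model, batch, optimizer):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = model(batch)            # forward in mixed precision
    scaler.scale(loss).backward()      # backprop the scaled loss
    scaler.step(optimizer)             # unscales grads; skips step on inf/nan
    scaler.update()                    # adjust the scale for the next iteration
```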
training/Detection/mmcv_custom/__init__.py
ADDED
@@ -0,0 +1,15 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# -*- coding: utf-8 -*-

from .checkpoint import load_checkpoint
from .layer_decay_optimizer_constructor import LearningRateDecayOptimizerConstructor
from .customized_text import CustomizedTextLoggerHook

__all__ = ['load_checkpoint', 'LearningRateDecayOptimizerConstructor', 'CustomizedTextLoggerHook']
```
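Importing `mmcv_custom` is what makes `constructor='LearningRateDecayOptimizerConstructor'` resolvable from the configs: the class registers itself with mmcv's optimizer-builder registry at import time. A minimal sketch of that registration pattern follows; the real implementation, including the `paramwise_cfg` handling, lives in `layer_decay_optimizer_constructor.py`:

```python
# Sketch of registering a custom optimizer constructor with mmcv 1.x.
from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor


@OPTIMIZER_BUILDERS.register_module()
class LearningRateDecayOptimizerConstructor(DefaultOptimizerConstructor):
    """Sketch only: groups parameters and attaches depth-scaled lr values."""

    def add_params(self, params, module, **kwargs):
        # the real constructor builds per-bucket parameter groups from
        # paramwise_cfg here before delegating to the default behavior
        super().add_params(params, module, **kwargs)
```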
training/Detection/mmcv_custom/checkpoint.py
ADDED
@@ -0,0 +1,484 @@
```python
# Copyright (c) Open-MMLab. All rights reserved.
import io
import os
import os.path as osp
import pkgutil
import time
import warnings
from collections import OrderedDict
from importlib import import_module
from tempfile import TemporaryDirectory

import torch
import torchvision
from torch.optim import Optimizer
from torch.utils import model_zoo
from torch.nn import functional as F

import mmcv
from mmcv.fileio import FileClient
from mmcv.fileio import load as load_file
from mmcv.parallel import is_module_wrapper
from mmcv.utils import mkdir_or_exist
from mmcv.runner import get_dist_info

ENV_MMCV_HOME = 'MMCV_HOME'
ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
DEFAULT_CACHE_DIR = '~/.cache'


def _get_mmcv_home():
    mmcv_home = os.path.expanduser(
        os.getenv(
            ENV_MMCV_HOME,
            os.path.join(
                os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))

    mkdir_or_exist(mmcv_home)
    return mmcv_home


def load_state_dict(module, state_dict, strict=False, logger=None):
    """Load state_dict to a module.

    This method is modified from :meth:`torch.nn.Module.load_state_dict`.
    Default value for ``strict`` is set to ``False`` and the message for
    param mismatch will be shown even if strict is False.

    Args:
        module (Module): Module that receives the state_dict.
        state_dict (OrderedDict): Weights.
        strict (bool): whether to strictly enforce that the keys
            in :attr:`state_dict` match the keys returned by this module's
            :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
        logger (:obj:`logging.Logger`, optional): Logger to log the error
            message. If not specified, print function will be used.
    """
    unexpected_keys = []
    all_missing_keys = []
    err_msg = []

    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    # use _load_from_state_dict to enable checkpoint version control
    def load(module, prefix=''):
        # recursively check parallel module in case that the model has a
        # complicated structure, e.g., nn.Module(nn.Module(DDP))
        if is_module_wrapper(module):
            module = module.module
        local_metadata = {} if metadata is None else metadata.get(
            prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     all_missing_keys, unexpected_keys,
                                     err_msg)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(module)
    load = None  # break load->load reference cycle

    # ignore "num_batches_tracked" of BN layers
    missing_keys = [
        key for key in all_missing_keys if 'num_batches_tracked' not in key
    ]

    if unexpected_keys:
        err_msg.append('unexpected key in source '
                       f'state_dict: {", ".join(unexpected_keys)}\n')
    if missing_keys:
        err_msg.append(
            f'missing keys in source state_dict: {", ".join(missing_keys)}\n')

    rank, _ = get_dist_info()
    if len(err_msg) > 0 and rank == 0:
        err_msg.insert(
            0, 'The model and loaded state dict do not match exactly\n')
        err_msg = '\n'.join(err_msg)
        if strict:
            raise RuntimeError(err_msg)
        elif logger is not None:
            logger.warning(err_msg)
        else:
            print(err_msg)


def load_url_dist(url, model_dir=None):
    """In distributed setting, this function only download checkpoint at local
    rank 0."""
    rank, world_size = get_dist_info()
    rank = int(os.environ.get('LOCAL_RANK', rank))
    if rank == 0:
        checkpoint = model_zoo.load_url(url, model_dir=model_dir)
    if world_size > 1:
        torch.distributed.barrier()
        if rank > 0:
            checkpoint = model_zoo.load_url(url, model_dir=model_dir)
    return checkpoint


def load_pavimodel_dist(model_path, map_location=None):
    """In distributed setting, this function only download checkpoint at local
    rank 0."""
    try:
        from pavi import modelcloud
    except ImportError:
        raise ImportError(
            'Please install pavi to load checkpoint from modelcloud.')
    rank, world_size = get_dist_info()
    rank = int(os.environ.get('LOCAL_RANK', rank))
    if rank == 0:
        model = modelcloud.get(model_path)
        with TemporaryDirectory() as tmp_dir:
            downloaded_file = osp.join(tmp_dir, model.name)
            model.download(downloaded_file)
            checkpoint = torch.load(downloaded_file, map_location=map_location)
    if world_size > 1:
        torch.distributed.barrier()
        if rank > 0:
            model = modelcloud.get(model_path)
            with TemporaryDirectory() as tmp_dir:
                downloaded_file = osp.join(tmp_dir, model.name)
                model.download(downloaded_file)
                checkpoint = torch.load(
                    downloaded_file, map_location=map_location)
    return checkpoint


def load_fileclient_dist(filename, backend, map_location):
    """In distributed setting, this function only download checkpoint at local
    rank 0."""
    rank, world_size = get_dist_info()
    rank = int(os.environ.get('LOCAL_RANK', rank))
    allowed_backends = ['ceph']
    if backend not in allowed_backends:
        raise ValueError(f'Load from Backend {backend} is not supported.')
    if rank == 0:
        fileclient = FileClient(backend=backend)
        buffer = io.BytesIO(fileclient.get(filename))
        checkpoint = torch.load(buffer, map_location=map_location)
    if world_size > 1:
        torch.distributed.barrier()
        if rank > 0:
            fileclient = FileClient(backend=backend)
            buffer = io.BytesIO(fileclient.get(filename))
            checkpoint = torch.load(buffer, map_location=map_location)
    return checkpoint


def get_torchvision_models():
    model_urls = dict()
    for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
        if ispkg:
            continue
        _zoo = import_module(f'torchvision.models.{name}')
        if hasattr(_zoo, 'model_urls'):
            _urls = getattr(_zoo, 'model_urls')
            model_urls.update(_urls)
    return model_urls


def get_external_models():
    mmcv_home = _get_mmcv_home()
    default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
    default_urls = load_file(default_json_path)
    assert isinstance(default_urls, dict)
    external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
    if osp.exists(external_json_path):
        external_urls = load_file(external_json_path)
        assert isinstance(external_urls, dict)
        default_urls.update(external_urls)

    return default_urls


def get_mmcls_models():
    mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
    mmcls_urls = load_file(mmcls_json_path)
```
|
199 |
+
|
200 |
+
return mmcls_urls
|
201 |
+
|
202 |
+
|
203 |
+
def get_deprecated_model_names():
|
204 |
+
deprecate_json_path = osp.join(mmcv.__path__[0],
|
205 |
+
'model_zoo/deprecated.json')
|
206 |
+
deprecate_urls = load_file(deprecate_json_path)
|
207 |
+
assert isinstance(deprecate_urls, dict)
|
208 |
+
|
209 |
+
return deprecate_urls
|
210 |
+
|
211 |
+
|
212 |
+
def _process_mmcls_checkpoint(checkpoint):
|
213 |
+
state_dict = checkpoint['state_dict']
|
214 |
+
new_state_dict = OrderedDict()
|
215 |
+
for k, v in state_dict.items():
|
216 |
+
if k.startswith('backbone.'):
|
217 |
+
new_state_dict[k[9:]] = v
|
218 |
+
new_checkpoint = dict(state_dict=new_state_dict)
|
219 |
+
|
220 |
+
return new_checkpoint
|
221 |
+
|
222 |
+
|
223 |
+
def _load_checkpoint(filename, map_location=None):
|
224 |
+
"""Load checkpoint from somewhere (modelzoo, file, url).
|
225 |
+
Args:
|
226 |
+
filename (str): Accept local filepath, URL, ``torchvision://xxx``,
|
227 |
+
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
|
228 |
+
details.
|
229 |
+
map_location (str | None): Same as :func:`torch.load`. Default: None.
|
230 |
+
Returns:
|
231 |
+
dict | OrderedDict: The loaded checkpoint. It can be either an
|
232 |
+
OrderedDict storing model weights or a dict containing other
|
233 |
+
information, which depends on the checkpoint.
|
234 |
+
"""
|
235 |
+
if filename.startswith('modelzoo://'):
|
236 |
+
warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
|
237 |
+
'use "torchvision://" instead')
|
238 |
+
model_urls = get_torchvision_models()
|
239 |
+
model_name = filename[11:]
|
240 |
+
checkpoint = load_url_dist(model_urls[model_name])
|
241 |
+
elif filename.startswith('torchvision://'):
|
242 |
+
model_urls = get_torchvision_models()
|
243 |
+
model_name = filename[14:]
|
244 |
+
checkpoint = load_url_dist(model_urls[model_name])
|
245 |
+
elif filename.startswith('open-mmlab://'):
|
246 |
+
model_urls = get_external_models()
|
247 |
+
model_name = filename[13:]
|
248 |
+
deprecated_urls = get_deprecated_model_names()
|
249 |
+
if model_name in deprecated_urls:
|
250 |
+
warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
|
251 |
+
f'of open-mmlab://{deprecated_urls[model_name]}')
|
252 |
+
model_name = deprecated_urls[model_name]
|
253 |
+
model_url = model_urls[model_name]
|
254 |
+
# check if is url
|
255 |
+
if model_url.startswith(('http://', 'https://')):
|
256 |
+
checkpoint = load_url_dist(model_url)
|
257 |
+
else:
|
258 |
+
filename = osp.join(_get_mmcv_home(), model_url)
|
259 |
+
if not osp.isfile(filename):
|
260 |
+
raise IOError(f'{filename} is not a checkpoint file')
|
261 |
+
checkpoint = torch.load(filename, map_location=map_location)
|
262 |
+
elif filename.startswith('mmcls://'):
|
263 |
+
model_urls = get_mmcls_models()
|
264 |
+
model_name = filename[8:]
|
265 |
+
checkpoint = load_url_dist(model_urls[model_name])
|
266 |
+
checkpoint = _process_mmcls_checkpoint(checkpoint)
|
267 |
+
elif filename.startswith(('http://', 'https://')):
|
268 |
+
checkpoint = load_url_dist(filename)
|
269 |
+
elif filename.startswith('pavi://'):
|
270 |
+
model_path = filename[7:]
|
271 |
+
checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
|
272 |
+
elif filename.startswith('s3://'):
|
273 |
+
checkpoint = load_fileclient_dist(
|
274 |
+
filename, backend='ceph', map_location=map_location)
|
275 |
+
else:
|
276 |
+
if not osp.isfile(filename):
|
277 |
+
raise IOError(f'{filename} is not a checkpoint file')
|
278 |
+
checkpoint = torch.load(filename, map_location=map_location)
|
279 |
+
return checkpoint
|
280 |
+
|
281 |
+
|
282 |
+
def load_checkpoint(model,
|
283 |
+
filename,
|
284 |
+
map_location='cpu',
|
285 |
+
strict=False,
|
286 |
+
logger=None):
|
287 |
+
"""Load checkpoint from a file or URI.
|
288 |
+
Args:
|
289 |
+
model (Module): Module to load checkpoint.
|
290 |
+
filename (str): Accept local filepath, URL, ``torchvision://xxx``,
|
291 |
+
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
|
292 |
+
details.
|
293 |
+
map_location (str): Same as :func:`torch.load`.
|
294 |
+
strict (bool): Whether to allow different params for the model and
|
295 |
+
checkpoint.
|
296 |
+
logger (:mod:`logging.Logger` or None): The logger for error message.
|
297 |
+
Returns:
|
298 |
+
dict or OrderedDict: The loaded checkpoint.
|
299 |
+
"""
|
300 |
+
checkpoint = _load_checkpoint(filename, map_location)
|
301 |
+
# OrderedDict is a subclass of dict
|
302 |
+
if not isinstance(checkpoint, dict):
|
303 |
+
raise RuntimeError(
|
304 |
+
f'No state_dict found in checkpoint file {filename}')
|
305 |
+
# get state_dict from checkpoint
|
306 |
+
if 'state_dict' in checkpoint:
|
307 |
+
state_dict = checkpoint['state_dict']
|
308 |
+
elif 'model' in checkpoint:
|
309 |
+
state_dict = checkpoint['model']
|
310 |
+
else:
|
311 |
+
state_dict = checkpoint
|
312 |
+
# strip prefix of state_dict
|
313 |
+
if list(state_dict.keys())[0].startswith('module.'):
|
314 |
+
state_dict = {k[7:]: v for k, v in state_dict.items()}
|
315 |
+
|
316 |
+
# for MoBY, load model of online branch
|
317 |
+
if sorted(list(state_dict.keys()))[0].startswith('encoder'):
|
318 |
+
state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}
|
319 |
+
|
320 |
+
# reshape absolute position embedding
|
321 |
+
if state_dict.get('absolute_pos_embed') is not None:
|
322 |
+
absolute_pos_embed = state_dict['absolute_pos_embed']
|
323 |
+
N1, L, C1 = absolute_pos_embed.size()
|
324 |
+
N2, C2, H, W = model.absolute_pos_embed.size()
|
325 |
+
if N1 != N2 or C1 != C2 or L != H*W:
|
326 |
+
logger.warning("Error in loading absolute_pos_embed, pass")
|
327 |
+
else:
|
328 |
+
state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
|
329 |
+
|
330 |
+
# interpolate position bias table if needed
|
331 |
+
relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
|
332 |
+
for table_key in relative_position_bias_table_keys:
|
333 |
+
table_pretrained = state_dict[table_key]
|
334 |
+
table_current = model.state_dict()[table_key]
|
335 |
+
L1, nH1 = table_pretrained.size()
|
336 |
+
L2, nH2 = table_current.size()
|
337 |
+
if nH1 != nH2:
|
338 |
+
logger.warning(f"Error in loading {table_key}, pass")
|
339 |
+
else:
|
340 |
+
if L1 != L2:
|
341 |
+
S1 = int(L1 ** 0.5)
|
342 |
+
S2 = int(L2 ** 0.5)
|
343 |
+
table_pretrained_resized = F.interpolate(
|
344 |
+
table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
|
345 |
+
size=(S2, S2), mode='bicubic')
|
346 |
+
state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0)
|
347 |
+
|
348 |
+
# load state_dict
|
349 |
+
load_state_dict(model, state_dict, strict, logger)
|
350 |
+
return checkpoint
|
351 |
+
|
352 |
+
|
353 |
+
def weights_to_cpu(state_dict):
|
354 |
+
"""Copy a model state_dict to cpu.
|
355 |
+
Args:
|
356 |
+
state_dict (OrderedDict): Model weights on GPU.
|
357 |
+
Returns:
|
358 |
+
OrderedDict: Model weights on GPU.
|
359 |
+
"""
|
360 |
+
state_dict_cpu = OrderedDict()
|
361 |
+
for key, val in state_dict.items():
|
362 |
+
state_dict_cpu[key] = val.cpu()
|
363 |
+
return state_dict_cpu
|
364 |
+
|
365 |
+
|
366 |
+
def _save_to_state_dict(module, destination, prefix, keep_vars):
|
367 |
+
"""Saves module state to `destination` dictionary.
|
368 |
+
This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
|
369 |
+
Args:
|
370 |
+
module (nn.Module): The module to generate state_dict.
|
371 |
+
destination (dict): A dict where state will be stored.
|
372 |
+
prefix (str): The prefix for parameters and buffers used in this
|
373 |
+
module.
|
374 |
+
"""
|
375 |
+
for name, param in module._parameters.items():
|
376 |
+
if param is not None:
|
377 |
+
destination[prefix + name] = param if keep_vars else param.detach()
|
378 |
+
for name, buf in module._buffers.items():
|
379 |
+
# remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
|
380 |
+
if buf is not None:
|
381 |
+
destination[prefix + name] = buf if keep_vars else buf.detach()
|
382 |
+
|
383 |
+
|
384 |
+
def get_state_dict(module, destination=None, prefix='', keep_vars=False):
|
385 |
+
"""Returns a dictionary containing a whole state of the module.
|
386 |
+
Both parameters and persistent buffers (e.g. running averages) are
|
387 |
+
included. Keys are corresponding parameter and buffer names.
|
388 |
+
This method is modified from :meth:`torch.nn.Module.state_dict` to
|
389 |
+
recursively check parallel module in case that the model has a complicated
|
390 |
+
structure, e.g., nn.Module(nn.Module(DDP)).
|
391 |
+
Args:
|
392 |
+
module (nn.Module): The module to generate state_dict.
|
393 |
+
destination (OrderedDict): Returned dict for the state of the
|
394 |
+
module.
|
395 |
+
prefix (str): Prefix of the key.
|
396 |
+
keep_vars (bool): Whether to keep the variable property of the
|
397 |
+
parameters. Default: False.
|
398 |
+
Returns:
|
399 |
+
dict: A dictionary containing a whole state of the module.
|
400 |
+
"""
|
401 |
+
# recursively check parallel module in case that the model has a
|
402 |
+
# complicated structure, e.g., nn.Module(nn.Module(DDP))
|
403 |
+
if is_module_wrapper(module):
|
404 |
+
module = module.module
|
405 |
+
|
406 |
+
# below is the same as torch.nn.Module.state_dict()
|
407 |
+
if destination is None:
|
408 |
+
destination = OrderedDict()
|
409 |
+
destination._metadata = OrderedDict()
|
410 |
+
destination._metadata[prefix[:-1]] = local_metadata = dict(
|
411 |
+
version=module._version)
|
412 |
+
_save_to_state_dict(module, destination, prefix, keep_vars)
|
413 |
+
for name, child in module._modules.items():
|
414 |
+
if child is not None:
|
415 |
+
get_state_dict(
|
416 |
+
child, destination, prefix + name + '.', keep_vars=keep_vars)
|
417 |
+
for hook in module._state_dict_hooks.values():
|
418 |
+
hook_result = hook(module, destination, prefix, local_metadata)
|
419 |
+
if hook_result is not None:
|
420 |
+
destination = hook_result
|
421 |
+
return destination
|
422 |
+
|
423 |
+
|
424 |
+
def save_checkpoint(model, filename, optimizer=None, meta=None):
|
425 |
+
"""Save checkpoint to file.
|
426 |
+
The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
|
427 |
+
``optimizer``. By default ``meta`` will contain version and time info.
|
428 |
+
Args:
|
429 |
+
model (Module): Module whose params are to be saved.
|
430 |
+
filename (str): Checkpoint filename.
|
431 |
+
optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
|
432 |
+
meta (dict, optional): Metadata to be saved in checkpoint.
|
433 |
+
"""
|
434 |
+
if meta is None:
|
435 |
+
meta = {}
|
436 |
+
elif not isinstance(meta, dict):
|
437 |
+
raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
|
438 |
+
meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
|
439 |
+
|
440 |
+
if is_module_wrapper(model):
|
441 |
+
model = model.module
|
442 |
+
|
443 |
+
if hasattr(model, 'CLASSES') and model.CLASSES is not None:
|
444 |
+
# save class name to the meta
|
445 |
+
meta.update(CLASSES=model.CLASSES)
|
446 |
+
|
447 |
+
checkpoint = {
|
448 |
+
'meta': meta,
|
449 |
+
'state_dict': weights_to_cpu(get_state_dict(model))
|
450 |
+
}
|
451 |
+
# save optimizer state dict in the checkpoint
|
452 |
+
if isinstance(optimizer, Optimizer):
|
453 |
+
checkpoint['optimizer'] = optimizer.state_dict()
|
454 |
+
elif isinstance(optimizer, dict):
|
455 |
+
checkpoint['optimizer'] = {}
|
456 |
+
for name, optim in optimizer.items():
|
457 |
+
checkpoint['optimizer'][name] = optim.state_dict()
|
458 |
+
|
459 |
+
if filename.startswith('pavi://'):
|
460 |
+
try:
|
461 |
+
from pavi import modelcloud
|
462 |
+
from pavi.exception import NodeNotFoundError
|
463 |
+
except ImportError:
|
464 |
+
raise ImportError(
|
465 |
+
'Please install pavi to load checkpoint from modelcloud.')
|
466 |
+
model_path = filename[7:]
|
467 |
+
root = modelcloud.Folder()
|
468 |
+
model_dir, model_name = osp.split(model_path)
|
469 |
+
try:
|
470 |
+
model = modelcloud.get(model_dir)
|
471 |
+
except NodeNotFoundError:
|
472 |
+
model = root.create_training_model(model_dir)
|
473 |
+
with TemporaryDirectory() as tmp_dir:
|
474 |
+
checkpoint_file = osp.join(tmp_dir, model_name)
|
475 |
+
with open(checkpoint_file, 'wb') as f:
|
476 |
+
torch.save(checkpoint, f)
|
477 |
+
f.flush()
|
478 |
+
model.create_file(checkpoint_file, name=model_name)
|
479 |
+
else:
|
480 |
+
mmcv.mkdir_or_exist(osp.dirname(filename))
|
481 |
+
# immediately flush buffer
|
482 |
+
with open(filename, 'wb') as f:
|
483 |
+
torch.save(checkpoint, f)
|
484 |
+
f.flush()
|
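For downstream use, a minimal sketch of driving this loader (assuming `load_checkpoint` is re-exported by `mmcv_custom/__init__.py`; the model and the checkpoint path are placeholders, not artifacts shipped with this commit):

```python
# Hypothetical usage sketch: load pretrained weights into a built backbone.
from mmcv_custom import load_checkpoint  # assumes package is on PYTHONPATH

checkpoint = load_checkpoint(
    model,                              # an nn.Module, e.g. a detector backbone
    'pretrained/revcol_tiny_1k.pth',    # path, URL, or scheme like 'torchvision://resnet50'
    map_location='cpu',
    strict=False,                       # mismatched keys are reported, not fatal
    logger=None)                        # falls back to print() for mismatch messages
```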
training/Detection/mmcv_custom/customized_text.py
ADDED
@@ -0,0 +1,130 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


import datetime
from collections import OrderedDict

import torch

import mmcv
from mmcv.runner import HOOKS
from mmcv.runner import TextLoggerHook


@HOOKS.register_module()
class CustomizedTextLoggerHook(TextLoggerHook):
    """Customized Text Logger hook.

    This logger prints out both lr and layer_0_lr.
    """

    def _log_info(self, log_dict, runner):
        # print exp name for users to distinguish experiments
        # at every ``interval_exp_name`` iterations and the end of each epoch
        if runner.meta is not None and 'exp_name' in runner.meta:
            if (self.every_n_iters(runner, self.interval_exp_name)) or (
                    self.by_epoch and self.end_of_epoch(runner)):
                exp_info = f'Exp name: {runner.meta["exp_name"]}'
                runner.logger.info(exp_info)

        if log_dict['mode'] == 'train':
            lr_str = {}
            for lr_type in ['lr', 'layer_0_lr']:
                if isinstance(log_dict[lr_type], dict):
                    lr_str[lr_type] = []
                    for k, val in log_dict[lr_type].items():
                        # append to the per-type list, not to the dict itself
                        lr_str[lr_type].append(f'{lr_type}_{k}: {val:.3e}')
                    lr_str[lr_type] = ' '.join(lr_str[lr_type])
                else:
                    lr_str[lr_type] = f'{lr_type}: {log_dict[lr_type]:.3e}'

            # by epoch: Epoch [4][100/1000]
            # by iter: Iter [100/100000]
            if self.by_epoch:
                log_str = f'Epoch [{log_dict["epoch"]}]' \
                          f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t'
            else:
                log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t'
            log_str += f'{lr_str["lr"]}, {lr_str["layer_0_lr"]}, '

            if 'time' in log_dict.keys():
                self.time_sec_tot += (log_dict['time'] * self.interval)
                time_sec_avg = self.time_sec_tot / (
                    runner.iter - self.start_iter + 1)
                eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
                eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                log_str += f'eta: {eta_str}, '
                log_str += f'time: {log_dict["time"]:.3f}, ' \
                           f'data_time: {log_dict["data_time"]:.3f}, '
                # statistic memory
                if torch.cuda.is_available():
                    log_str += f'memory: {log_dict["memory"]}, '
        else:
            # val/test time
            # here 1000 is the length of the val dataloader
            # by epoch: Epoch[val] [4][1000]
            # by iter: Iter[val] [1000]
            if self.by_epoch:
                log_str = f'Epoch({log_dict["mode"]}) ' \
                          f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t'
            else:
                log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t'

        log_items = []
        for name, val in log_dict.items():
            # TODO: resolve this hack
            # these items have been in log_str
            if name in [
                    'mode', 'Epoch', 'iter', 'lr', 'layer_0_lr', 'time',
                    'data_time', 'memory', 'epoch'
            ]:
                continue
            if isinstance(val, float):
                val = f'{val:.4f}'
            log_items.append(f'{name}: {val}')
        log_str += ', '.join(log_items)

        runner.logger.info(log_str)

    def log(self, runner):
        if 'eval_iter_num' in runner.log_buffer.output:
            # this doesn't modify runner.iter and is regardless of by_epoch
            cur_iter = runner.log_buffer.output.pop('eval_iter_num')
        else:
            cur_iter = self.get_iter(runner, inner_iter=True)

        log_dict = OrderedDict(
            mode=self.get_mode(runner),
            epoch=self.get_epoch(runner),
            iter=cur_iter)

        # record lr and layer_0_lr
        cur_lr = runner.current_lr()
        if isinstance(cur_lr, list):
            log_dict['layer_0_lr'] = min(cur_lr)
            log_dict['lr'] = max(cur_lr)
        else:
            assert isinstance(cur_lr, dict)
            log_dict['lr'], log_dict['layer_0_lr'] = {}, {}
            for k, lr_ in cur_lr.items():
                assert isinstance(lr_, list)
                log_dict['layer_0_lr'].update({k: min(lr_)})
                log_dict['lr'].update({k: max(lr_)})

        if 'time' in runner.log_buffer.output:
            # statistic memory
            if torch.cuda.is_available():
                log_dict['memory'] = self._get_max_memory(runner)

        log_dict = dict(log_dict, **runner.log_buffer.output)

        self._log_info(log_dict, runner)
        self._dump_log(log_dict, runner)
        return log_dict
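To actually see the extra `layer_0_lr` field in the training log, the hook has to replace the stock text logger in the config; a typical fragment, using the standard mmcv `log_config` syntax (the `custom_imports` line assumes `mmcv_custom` is importable on PYTHONPATH):

```python
# Config fragment: swap the default TextLoggerHook for the customized one.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='CustomizedTextLoggerHook'),
    ])
custom_imports = dict(imports=['mmcv_custom'], allow_failed_imports=False)
```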
training/Detection/mmcv_custom/layer_decay_optimizer_constructor.py
ADDED
@@ -0,0 +1,121 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


import json
import re
from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
from mmcv.runner import get_dist_info
import numpy as np


def cal_model_depth(depth, num_subnet):
    dp = np.zeros((depth, num_subnet))
    dp[:, 0] = np.linspace(0, depth - 1, depth)
    dp[0, :] = np.linspace(0, num_subnet - 1, num_subnet)
    for i in range(1, depth):
        for j in range(1, num_subnet):
            dp[i][j] = min(dp[i][j - 1], dp[i - 1][j]) + 1
    dp = dp.astype(int)
    dp = dp + 1  # make layer ids start from 1
    return dp


def get_num_layer_layer_wise(n, layers, num_subnet=12):
    dp = cal_model_depth(sum(layers), num_subnet)
    if n.startswith("backbone.subnet"):
        n = n[9:]  # strip the "backbone." prefix
        name_part = n.split('.')
        subnet = int(name_part[0][6:])
        if name_part[1].startswith("alpha"):
            id = dp[0][subnet]
        else:
            level = int(name_part[1][-1])
            if name_part[2].startswith("blocks"):
                sub = int(name_part[3])
                if sub > layers[level] - 1:
                    sub = layers[level] - 1
                block = sum(layers[:level]) + sub

            if name_part[2].startswith("fusion"):
                block = sum(layers[:level])
            id = dp[block][subnet]
    elif n.startswith("backbone.stem"):
        id = 0
    else:
        id = dp[-1][-1] + 1
    return id

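To make the layer-id assignment concrete, here is a small worked example (values computed by hand from `cal_model_depth` above; the tiny `depth`/`num_subnet` values are illustrative only):

```python
# cal_model_depth(depth=4, num_subnet=3) fills a DP table where each entry is
# the shortest path length (in blocks) from the input to that (block, column)
# position, then shifts the ids to start from 1:
#
#   [[1, 2, 3],
#    [2, 3, 4],
#    [3, 4, 5],
#    [4, 5, 6]]
#
# Parameters deeper in the network or in later columns get a larger layer id,
# and hence a smaller decay exponent (larger lr scale) in add_params below.
print(cal_model_depth(4, 3))
```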
@OPTIMIZER_BUILDERS.register_module()
class LearningRateDecayOptimizerConstructor(DefaultOptimizerConstructor):
    def add_params(self, params, module, prefix='', is_dcn_module=None):
        """Add all parameters of module to the params list.

        The parameters of the given module will be added to the list of param
        groups, with specific rules defined by paramwise_cfg.

        Args:
            params (list[dict]): A list of param groups, it will be modified
                in place.
            module (nn.Module): The module to be added.
            prefix (str): The prefix of the module.
            is_dcn_module (int|float|None): If the current module is a
                submodule of DCN, `is_dcn_module` will be passed to
                control conv_offset layer's learning rate. Defaults to None.
        """
        parameter_groups = {}
        print(self.paramwise_cfg)
        num_layers = cal_model_depth(
            sum(self.paramwise_cfg.get('layers')),
            self.paramwise_cfg.get('num_subnet'))[-1][-1] + 2
        decay_rate = self.paramwise_cfg.get('decay_rate')
        decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise')
        print('Build LearningRateDecayOptimizerConstructor %s %f - %d' %
              (decay_type, decay_rate, num_layers))
        weight_decay = self.base_wd

        for name, param in module.named_parameters():
            if not param.requires_grad:
                continue  # frozen weights
            if len(param.shape) == 1 or name.endswith('.bias') or name in (
                    'pos_embed', 'cls_token') or re.match('(.*).alpha.$', name):
                group_name = 'no_decay'
                this_weight_decay = 0.
            else:
                group_name = 'decay'
                this_weight_decay = weight_decay

            if decay_type == 'layer_wise':
                layer_id = get_num_layer_layer_wise(
                    name, self.paramwise_cfg.get('layers'),
                    self.paramwise_cfg.get('num_subnet'))

            group_name = 'layer_%d_%s' % (layer_id, group_name)

            if group_name not in parameter_groups:
                scale = decay_rate ** (num_layers - layer_id - 1)

                parameter_groups[group_name] = {
                    'weight_decay': this_weight_decay,
                    'params': [],
                    'param_names': [],
                    'lr_scale': scale,
                    'group_name': group_name,
                    'lr': scale * self.base_lr,
                }

            parameter_groups[group_name]['params'].append(param)
            parameter_groups[group_name]['param_names'].append(name)
        rank, _ = get_dist_info()
        if rank == 0:
            to_display = {}
            for key in parameter_groups:
                to_display[key] = {
                    'param_names': parameter_groups[key]['param_names'],
                    'lr_scale': parameter_groups[key]['lr_scale'],
                    'lr': parameter_groups[key]['lr'],
                    'weight_decay': parameter_groups[key]['weight_decay'],
                }
            print('Param groups = %s' % json.dumps(to_display, indent=2))

        params.extend(parameter_groups.values())
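A config fragment of the kind that would exercise this constructor; the field values are illustrative, and `layers`/`num_subnet` must match the RevCol variant being trained:

```python
# Hypothetical mmdet config snippet (values illustrative, not from this commit):
optimizer = dict(
    constructor='LearningRateDecayOptimizerConstructor',
    type='AdamW', lr=2e-4, betas=(0.9, 0.999), weight_decay=0.05,
    paramwise_cfg=dict(
        decay_type='layer_wise',   # the only branch implemented above
        decay_rate=0.7,            # per-layer lr multiplier
        layers=[2, 2, 4, 2],       # blocks per level of the RevCol backbone
        num_subnet=4))             # number of RevCol columns
```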
training/Detection/mmcv_custom/runner/checkpoint.py
ADDED
@@ -0,0 +1,85 @@
# Copyright (c) Open-MMLab. All rights reserved.
import os.path as osp
import time
from tempfile import TemporaryDirectory

import torch
from torch.optim import Optimizer

import mmcv
from mmcv.parallel import is_module_wrapper
from mmcv.runner.checkpoint import weights_to_cpu, get_state_dict

try:
    import apex
except ImportError:
    print('apex is not installed')


def save_checkpoint(model, filename, optimizer=None, meta=None):
    """Save checkpoint to file.

    The checkpoint will have 4 fields: ``meta``, ``state_dict``,
    ``optimizer`` and ``amp``. By default ``meta`` will contain version
    and time info.

    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
        meta (dict, optional): Metadata to be saved in checkpoint.
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())

    if is_module_wrapper(model):
        model = model.module

    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
        # save class name to the meta
        meta.update(CLASSES=model.CLASSES)

    checkpoint = {
        'meta': meta,
        'state_dict': weights_to_cpu(get_state_dict(model))
    }
    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        checkpoint['optimizer'] = {}
        for name, optim in optimizer.items():
            checkpoint['optimizer'][name] = optim.state_dict()

    # save amp state dict in the checkpoint
    # checkpoint['amp'] = apex.amp.state_dict()

    if filename.startswith('pavi://'):
        try:
            from pavi import modelcloud
            from pavi.exception import NodeNotFoundError
        except ImportError:
            raise ImportError(
                'Please install pavi to load checkpoint from modelcloud.')
        model_path = filename[7:]
        root = modelcloud.Folder()
        model_dir, model_name = osp.split(model_path)
        try:
            model = modelcloud.get(model_dir)
        except NodeNotFoundError:
            model = root.create_training_model(model_dir)
        with TemporaryDirectory() as tmp_dir:
            checkpoint_file = osp.join(tmp_dir, model_name)
            with open(checkpoint_file, 'wb') as f:
                torch.save(checkpoint, f)
                f.flush()
            model.create_file(checkpoint_file, name=model_name)
    else:
        mmcv.mkdir_or_exist(osp.dirname(filename))
        # immediately flush buffer
        with open(filename, 'wb') as f:
            torch.save(checkpoint, f)
            f.flush()
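A minimal sketch of calling this saver directly; the path, epoch numbers, and the `model`/`optimizer` objects are placeholders for whatever the training loop built:

```python
# Hypothetical usage: save model + optimizer state after an epoch.
save_checkpoint(
    model,
    'work_dirs/revcol/epoch_12.pth',   # plain path; 'pavi://...' also works
    optimizer=optimizer,               # or a dict of named optimizers
    meta=dict(epoch=12, iter=45000))   # merged with mmcv version/time info
```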
training/Detection/mmdet/models/backbones/__init__.py
ADDED
@@ -0,0 +1,28 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .csp_darknet import CSPDarknet
from .darknet import Darknet
from .detectors_resnet import DetectoRS_ResNet
from .detectors_resnext import DetectoRS_ResNeXt
from .efficientnet import EfficientNet
from .hourglass import HourglassNet
from .hrnet import HRNet
from .mobilenet_v2 import MobileNetV2
from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2
from .regnet import RegNet
from .res2net import Res2Net
from .resnest import ResNeSt
from .resnet import ResNet, ResNetV1d
from .resnext import ResNeXt
from .ssd_vgg import SSDVGG
from .swin import SwinTransformer
from .trident_resnet import TridentResNet
from .revcol_huge import RevCol_Huge
from .revcol import RevCol

__all__ = [
    'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet',
    'MobileNetV2', 'Res2Net', 'HourglassNet', 'DetectoRS_ResNet',
    'DetectoRS_ResNeXt', 'Darknet', 'ResNeSt', 'TridentResNet', 'CSPDarknet',
    'SwinTransformer', 'PyramidVisionTransformer',
    'PyramidVisionTransformerV2', 'EfficientNet', 'RevCol'
]
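Because `RevCol` is now exported here, it can be built through the usual mmdet registry path; a sketch of that, with illustrative channel/layer values rather than values taken from this commit's configs:

```python
# Hypothetical registry usage; argument values are illustrative.
from mmdet.models import build_backbone

backbone = build_backbone(dict(
    type='RevCol',
    channels=[64, 128, 256, 512],
    layers=[2, 2, 4, 2],
    num_subnet=4,
    save_memory=True))
```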
training/Detection/mmdet/models/backbones/revcol.py
ADDED
@@ -0,0 +1,187 @@
import numpy as np
import torch
import torch.nn as nn

from .revcol_module import ConvNextBlock, LayerNorm, UpSampleConvnext
from mmdet.utils import get_root_logger
from ..builder import BACKBONES
from .revcol_function import ReverseFunction
from mmcv.cnn import constant_init, trunc_normal_init
from mmcv.runner import BaseModule, _load_checkpoint
from torch.utils.checkpoint import checkpoint


class Fusion(nn.Module):
    def __init__(self, level, channels, first_col) -> None:
        super().__init__()

        self.level = level
        self.first_col = first_col
        self.down = nn.Sequential(
            nn.Conv2d(channels[level - 1], channels[level], kernel_size=2, stride=2),
            LayerNorm(channels[level], eps=1e-6, data_format="channels_first"),
        ) if level in [1, 2, 3] else nn.Identity()
        if not first_col:
            self.up = UpSampleConvnext(1, channels[level + 1], channels[level]) \
                if level in [0, 1, 2] else nn.Identity()

    def forward(self, *args):
        c_down, c_up = args

        if self.first_col:
            x = self.down(c_down)
            return x

        if self.level == 3:
            x = self.down(c_down)
        else:
            x = self.up(c_up) + self.down(c_down)
        return x


class Level(nn.Module):
    def __init__(self, level, channels, layers, kernel_size, first_col, dp_rate=0.0) -> None:
        super().__init__()
        countlayer = sum(layers[:level])
        expansion = 4
        self.fusion = Fusion(level, channels, first_col)
        modules = [ConvNextBlock(channels[level], expansion * channels[level],
                                 channels[level], kernel_size=kernel_size,
                                 layer_scale_init_value=1e-6,
                                 drop_path=dp_rate[countlayer + i])
                   for i in range(layers[level])]
        self.blocks = nn.Sequential(*modules)

    def forward(self, *args):
        x = self.fusion(*args)
        x = self.blocks(x)
        return x


class SubNet(nn.Module):
    def __init__(self, channels, layers, kernel_size, first_col, dp_rates, save_memory) -> None:
        super().__init__()
        shortcut_scale_init_value = 0.5
        self.save_memory = save_memory
        self.alpha0 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[0], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None
        self.alpha1 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[1], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None
        self.alpha2 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[2], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None
        self.alpha3 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[3], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None

        self.level0 = Level(0, channels, layers, kernel_size, first_col, dp_rates)
        self.level1 = Level(1, channels, layers, kernel_size, first_col, dp_rates)
        self.level2 = Level(2, channels, layers, kernel_size, first_col, dp_rates)
        self.level3 = Level(3, channels, layers, kernel_size, first_col, dp_rates)

    def _forward_nonreverse(self, *args):
        x, c0, c1, c2, c3 = args

        c0 = (self.alpha0) * c0 + self.level0(x, c1)
        c1 = (self.alpha1) * c1 + self.level1(c0, c2)
        c2 = (self.alpha2) * c2 + self.level2(c1, c3)
        c3 = (self.alpha3) * c3 + self.level3(c2, None)

        return c0, c1, c2, c3

    def _forward_reverse(self, *args):
        local_funs = [self.level0, self.level1, self.level2, self.level3]
        alpha = [self.alpha0, self.alpha1, self.alpha2, self.alpha3]
        _, c0, c1, c2, c3 = ReverseFunction.apply(
            local_funs, alpha, *args)

        return c0, c1, c2, c3

    def forward(self, *args):
        self._clamp_abs(self.alpha0.data, 1e-3)
        self._clamp_abs(self.alpha1.data, 1e-3)
        self._clamp_abs(self.alpha2.data, 1e-3)
        self._clamp_abs(self.alpha3.data, 1e-3)

        if self.save_memory:
            return self._forward_reverse(*args)
        else:
            return self._forward_nonreverse(*args)

    def _clamp_abs(self, data, value):
        with torch.no_grad():
            sign = data.sign()
            data.abs_().clamp_(value)
            data *= sign


@BACKBONES.register_module()
class RevCol(BaseModule):
    def __init__(self, channels=[32, 64, 96, 128], layers=[2, 3, 6, 3], num_subnet=5,
                 kernel_size=3, num_classes=1000, drop_path=0.0, save_memory=True,
                 single_head=True, out_indices=[0, 1, 2, 3], init_cfg=None) -> None:
        super().__init__(init_cfg)
        self.num_subnet = num_subnet
        self.single_head = single_head
        self.out_indices = out_indices
        self.init_cfg = init_cfg

        self.stem = nn.Sequential(
            nn.Conv2d(3, channels[0], kernel_size=4, stride=4),
            LayerNorm(channels[0], eps=1e-6, data_format="channels_first")
        )

        dp_rate = [x.item() for x in torch.linspace(0, drop_path, sum(layers))]
        for i in range(num_subnet):
            first_col = True if i == 0 else False
            self.add_module(f'subnet{str(i)}', SubNet(
                channels, layers, kernel_size, first_col, dp_rates=dp_rate,
                save_memory=save_memory))

    def init_weights(self):
        logger = get_root_logger()
        if self.init_cfg is None:
            logger.warning(f'No pre-trained weights for '
                           f'{self.__class__.__name__}, '
                           f'training starts from scratch')
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    trunc_normal_init(m, std=.02, bias=0.)
                elif isinstance(m, nn.LayerNorm):
                    constant_init(m, 1.0)
        else:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                f'specify `Pretrained` in ' \
                f'`init_cfg` in ' \
                f'{self.__class__.__name__} '
            ckpt = _load_checkpoint(
                self.init_cfg.checkpoint, logger=logger, map_location='cpu')
            if 'state_dict' in ckpt:
                _state_dict = ckpt['state_dict']
            elif 'model' in ckpt:
                _state_dict = ckpt['model']
            else:
                _state_dict = ckpt

            state_dict = _state_dict
            # strip prefix of state_dict
            if list(state_dict.keys())[0].startswith('module.'):
                state_dict = {k[7:]: v for k, v in state_dict.items()}

            # load state_dict
            self.load_state_dict(state_dict, False)

    def forward(self, x):
        x = self.stem(x)
        c0, c1, c2, c3 = 0, 0, 0, 0
        for i in range(self.num_subnet):
            # gradient checkpointing alternative:
            # c0, c1, c2, c3 = checkpoint(getattr(self, f'subnet{str(i)}'), x, c0, c1, c2, c3)
            c0, c1, c2, c3 = getattr(self, f'subnet{str(i)}')(x, c0, c1, c2, c3)
        return c0, c1, c2, c3

    def cal_dp_rate(self, depth, num_subnet, drop_path):
        dp = np.zeros((depth, num_subnet))
        dp[:, 0] = np.linspace(0, depth - 1, depth)
        dp[0, :] = np.linspace(0, num_subnet - 1, num_subnet)
        for i in range(1, depth):
            for j in range(1, num_subnet):
                dp[i][j] = min(dp[i][j - 1], dp[i - 1][j]) + 1
        ratio = dp[-1][-1] / drop_path
        dp_matrix = dp / ratio
        return dp_matrix
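As a sanity check on the backbone contract: `forward` returns one feature map per level at strides 4/8/16/32 of the input (the stride-4 stem plus one stride-2 downsample per level). A quick smoke-test sketch, with the non-reversible path for simplicity:

```python
# Hypothetical smoke test for the backbone's output pyramid.
import torch

model = RevCol(channels=[32, 64, 96, 128], layers=[2, 3, 6, 3],
               num_subnet=2, save_memory=False)
c0, c1, c2, c3 = model(torch.randn(1, 3, 224, 224))
# Expected shapes at strides 4/8/16/32:
#   c0: (1, 32, 56, 56), c1: (1, 64, 28, 28),
#   c2: (1, 96, 14, 14), c3: (1, 128, 7, 7)
```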
training/Detection/mmdet/models/backbones/revcol_function.py
ADDED
@@ -0,0 +1,222 @@
import torch
from typing import Any, List, Tuple


def get_gpu_states(fwd_gpu_devices) -> Tuple[List[int], List[torch.Tensor]]:
    # This will not error out if "arg" is a CPU tensor or a non-tensor type
    # because the conditionals short-circuit.
    fwd_gpu_states = []
    for device in fwd_gpu_devices:
        with torch.cuda.device(device):
            fwd_gpu_states.append(torch.cuda.get_rng_state())

    return fwd_gpu_states


def get_gpu_device(*args):
    fwd_gpu_devices = list(set(arg.get_device() for arg in args
                               if isinstance(arg, torch.Tensor) and arg.is_cuda))
    return fwd_gpu_devices


def set_device_states(fwd_cpu_state, devices, states) -> None:
    torch.set_rng_state(fwd_cpu_state)
    for device, state in zip(devices, states):
        with torch.cuda.device(device):
            torch.cuda.set_rng_state(state)


def detach_and_grad(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]:
    if isinstance(inputs, tuple):
        out = []
        for inp in inputs:
            if not isinstance(inp, torch.Tensor):
                out.append(inp)
                continue

            x = inp.detach()
            x.requires_grad = True
            out.append(x)
        return tuple(out)
    else:
        raise RuntimeError(
            'Only tuple of tensors is supported. Got unsupported input type: '
            f'{type(inputs).__name__}')


def get_cpu_and_gpu_states(gpu_devices):
    return torch.get_rng_state(), get_gpu_states(gpu_devices)


class ReverseFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, run_functions, alpha, *args):
        l0, l1, l2, l3 = run_functions
        alpha0, alpha1, alpha2, alpha3 = alpha
        ctx.run_functions = run_functions
        ctx.alpha = alpha
        ctx.preserve_rng_state = True

        ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
                                   "dtype": torch.get_autocast_gpu_dtype(),
                                   "cache_enabled": torch.is_autocast_cache_enabled()}
        ctx.cpu_autocast_kwargs = {"enabled": torch.is_autocast_cpu_enabled(),
                                   "dtype": torch.get_autocast_cpu_dtype(),
                                   "cache_enabled": torch.is_autocast_cache_enabled()}

        assert len(args) == 5
        [x, c0, c1, c2, c3] = args
        if type(c0) == int:
            ctx.first_col = True
        else:
            ctx.first_col = False
        with torch.no_grad():
            if ctx.preserve_rng_state:
                gpu_devices = get_gpu_device(*args)
                ctx.gpu_devices = gpu_devices
                ctx.cpu_states_0, ctx.gpu_states_0 = get_cpu_and_gpu_states(gpu_devices)
                c0 = l0(x, c1, c3) + c0 * alpha0
                ctx.cpu_states_1, ctx.gpu_states_1 = get_cpu_and_gpu_states(gpu_devices)
                c1 = l1(c0, c2) + c1 * alpha1
                ctx.cpu_states_2, ctx.gpu_states_2 = get_cpu_and_gpu_states(gpu_devices)
                c2 = l2(c1, c3) + c2 * alpha2
                ctx.cpu_states_3, ctx.gpu_states_3 = get_cpu_and_gpu_states(gpu_devices)
                c3 = l3(c2) + c3 * alpha3
            else:
                c0 = l0(x, c1, c3) + c0 * alpha0
                c1 = l1(c0, c2) + c1 * alpha1
                c2 = l2(c1, c3) + c2 * alpha2
                c3 = l3(c2) + c3 * alpha3
        ctx.save_for_backward(x, c0, c1, c2, c3)
        return x, c0, c1, c2, c3

    @staticmethod
    def backward(ctx, *grad_outputs):
        x, c0, c1, c2, c3 = ctx.saved_tensors
        l0, l1, l2, l3 = ctx.run_functions
        alpha0, alpha1, alpha2, alpha3 = ctx.alpha
        gx_right, g0_right, g1_right, g2_right, g3_right = grad_outputs
        (x, c0, c1, c2, c3) = detach_and_grad((x, c0, c1, c2, c3))

        if ctx.preserve_rng_state:
            with torch.enable_grad(), \
                    torch.random.fork_rng(devices=ctx.gpu_devices, enabled=ctx.preserve_rng_state), \
                    torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs), \
                    torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):

                g3_up = g3_right
                g3_left = g3_up * alpha3  ## shortcut
                set_device_states(ctx.cpu_states_3, ctx.gpu_devices, ctx.gpu_states_3)
                oup3 = l3(c2)
                torch.autograd.backward(oup3, g3_up, retain_graph=True)
                with torch.no_grad():
                    c3_left = (1 / alpha3) * (c3 - oup3)  ## feature reverse
                    g2_up = g2_right + c2.grad
                    g2_left = g2_up * alpha2  ## shortcut

                (c3_left,) = detach_and_grad((c3_left,))
                set_device_states(ctx.cpu_states_2, ctx.gpu_devices, ctx.gpu_states_2)
                oup2 = l2(c1, c3_left)
                torch.autograd.backward(oup2, g2_up, retain_graph=True)
                c3_left.requires_grad = False
                cout3 = c3_left * alpha3  ## alpha3 update
                torch.autograd.backward(cout3, g3_up)

                with torch.no_grad():
                    c2_left = (1 / alpha2) * (c2 - oup2)  ## feature reverse
                    g3_left = g3_left + c3_left.grad if c3_left.grad is not None else g3_left
                    g1_up = g1_right + c1.grad
                    g1_left = g1_up * alpha1  ## shortcut

                (c2_left,) = detach_and_grad((c2_left,))
                set_device_states(ctx.cpu_states_1, ctx.gpu_devices, ctx.gpu_states_1)
                oup1 = l1(c0, c2_left)
                torch.autograd.backward(oup1, g1_up, retain_graph=True)
                c2_left.requires_grad = False
                cout2 = c2_left * alpha2  ## alpha2 update
                torch.autograd.backward(cout2, g2_up)

                with torch.no_grad():
                    c1_left = (1 / alpha1) * (c1 - oup1)  ## feature reverse
                    g0_up = g0_right + c0.grad
                    g0_left = g0_up * alpha0  ## shortcut
                    g2_left = g2_left + c2_left.grad if c2_left.grad is not None else g2_left  ## Fusion

                (c1_left, c3_left) = detach_and_grad((c1_left, c3_left))
                set_device_states(ctx.cpu_states_0, ctx.gpu_devices, ctx.gpu_states_0)
                oup0 = l0(x, c1_left, c3_left)
                torch.autograd.backward(oup0, g0_up, retain_graph=True)
                c1_left.requires_grad = False
                cout1 = c1_left * alpha1  ## alpha1 update
                torch.autograd.backward(cout1, g1_up)

                with torch.no_grad():
                    c0_left = (1 / alpha0) * (c0 - oup0)  ## feature reverse
                    gx_up = x.grad  ## Fusion
                    g1_left = g1_left + c1_left.grad if c1_left.grad is not None else g1_left  ## Fusion
                    g3_left = g3_left + c3_left.grad if c3_left.grad is not None else g3_left  ## Fusion
                c0_left.requires_grad = False
                cout0 = c0_left * alpha0  ## alpha0 update
                torch.autograd.backward(cout0, g0_up)
        else:
            with torch.enable_grad():
                g3_up = g3_right
                g3_left = g3_up * alpha3  ## shortcut
                oup3 = l3(c2)
                torch.autograd.backward(oup3, g3_up, retain_graph=True)
                with torch.no_grad():
                    c3_left = (1 / alpha3) * (c3 - oup3)  ## feature reverse
                    g2_up = g2_right + c2.grad
                    g2_left = g2_up * alpha2  ## shortcut

                (c3_left,) = detach_and_grad((c3_left,))
                oup2 = l2(c1, c3_left)
                torch.autograd.backward(oup2, g2_up, retain_graph=True)
                c3_left.requires_grad = False
                cout3 = c3_left * alpha3  ## alpha3 update
                torch.autograd.backward(cout3, g3_up)

                with torch.no_grad():
                    c2_left = (1 / alpha2) * (c2 - oup2)  ## feature reverse
                    g3_left = g3_left + c3_left.grad if c3_left.grad is not None else g3_left
                    g1_up = g1_right + c1.grad
                    g1_left = g1_up * alpha1  ## shortcut

                (c2_left,) = detach_and_grad((c2_left,))
                oup1 = l1(c0, c2_left)
                torch.autograd.backward(oup1, g1_up, retain_graph=True)
                c2_left.requires_grad = False
                cout2 = c2_left * alpha2  ## alpha2 update
                torch.autograd.backward(cout2, g2_up)

                with torch.no_grad():
                    c1_left = (1 / alpha1) * (c1 - oup1)  ## feature reverse
                    g0_up = g0_right + c0.grad
                    g0_left = g0_up * alpha0  ## shortcut
                    g2_left = g2_left + c2_left.grad if c2_left.grad is not None else g2_left  ## Fusion

                (c1_left, c3_left) = detach_and_grad((c1_left, c3_left))
                oup0 = l0(x, c1_left, c3_left)
                torch.autograd.backward(oup0, g0_up, retain_graph=True)
                c1_left.requires_grad = False
                cout1 = c1_left * alpha1  ## alpha1 update
                torch.autograd.backward(cout1, g1_up)

                with torch.no_grad():
                    c0_left = (1 / alpha0) * (c0 - oup0)  ## feature reverse
                    gx_up = x.grad  ## Fusion
                    g1_left = g1_left + c1_left.grad if c1_left.grad is not None else g1_left  ## Fusion
                    g3_left = g3_left + c3_left.grad if c3_left.grad is not None else g3_left  ## Fusion
                c0_left.requires_grad = False
                cout0 = c0_left * alpha0  ## alpha0 update
                torch.autograd.backward(cout0, g0_up)

        if ctx.first_col:
            return None, None, gx_up, None, None, None, None
        else:
            return None, None, gx_up, g0_left, g1_left, g2_left, g3_left
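The memory saving rests on the identity used in `backward`: each level computes c_out = F(inputs) + alpha * c_in elementwise, so the column input can be recomputed as c_in = (c_out - F(inputs)) / alpha instead of being stored for the whole network. A toy illustration of that reconstruction, using plain tensors rather than the autograd machinery above:

```python
# Toy check of the feature-reverse identity used above (illustrative only).
import torch

alpha = torch.full((1, 8, 1, 1), 0.5)
f = torch.nn.Conv2d(8, 8, 3, padding=1)  # stands in for one Level
c_in = torch.randn(2, 8, 16, 16)
x = torch.randn(2, 8, 16, 16)

c_out = f(x) + alpha * c_in              # forward rule of one level
c_rec = (1 / alpha) * (c_out - f(x))     # reverse rule applied in backward()
assert torch.allclose(c_rec, c_in, atol=1e-5)
```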
training/Detection/mmdet/models/backbones/revcol_module.py
ADDED
@@ -0,0 +1,85 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import DropPath


class UpSampleConvnext(nn.Module):
    def __init__(self, ratio, inchannel, outchannel):
        super().__init__()
        self.ratio = ratio
        self.channel_reschedule = nn.Sequential(
            # LayerNorm(inchannel, eps=1e-6, data_format="channels_last"),
            nn.Linear(inchannel, outchannel),
            LayerNorm(outchannel, eps=1e-6, data_format="channels_last"))
        self.upsample = nn.Upsample(scale_factor=2 ** ratio, mode='nearest')

    def forward(self, x):
        x = x.permute(0, 2, 3, 1)
        x = self.channel_reschedule(x)
        x = x.permute(0, 3, 1, 2)

        return self.upsample(x)


class LayerNorm(nn.Module):
    r""" LayerNorm that supports two data formats: channels_last (default) or
    channels_first. The ordering of the dimensions in the inputs: channels_last
    corresponds to inputs with shape (batch_size, height, width, channels) while
    channels_first corresponds to inputs with shape
    (batch_size, channels, height, width).
    """
    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_first"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x


class ConvNextBlock(nn.Module):
    r""" ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    We use (2) as we find it slightly faster in PyTorch.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """
    def __init__(self, in_channel, hidden_dim, out_channel, kernel_size=3,
                 layer_scale_init_value=1e-6, drop_path=0.0):
        super().__init__()
        self.dwconv = nn.Conv2d(in_channel, in_channel, kernel_size=kernel_size,
                                padding=(kernel_size - 1) // 2,
                                groups=in_channel)  # depthwise conv
        self.norm = nn.LayerNorm(in_channel, eps=1e-6)
        self.pwconv1 = nn.Linear(in_channel, hidden_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(hidden_dim, out_channel)
        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((out_channel)),
                                  requires_grad=True) if layer_scale_init_value > 0 else None
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        input = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        x = input + self.drop_path(x)
        return x
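Both modules are shape-preserving in (N, C, H, W); a quick sketch of how they compose (channel sizes are illustrative, and the residual add requires in_channel == out_channel):

```python
# Illustrative shape check for the modules above.
import torch

blk = ConvNextBlock(in_channel=64, hidden_dim=256, out_channel=64, kernel_size=3)
ln = LayerNorm(64, data_format="channels_first")
x = torch.randn(2, 64, 32, 32)
assert blk(x).shape == x.shape  # residual block keeps (N, C, H, W)
assert ln(x).shape == x.shape   # per-position channel normalization
```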
training/Detection/mmdet/utils/__init__.py
ADDED
@@ -0,0 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .collect_env import collect_env
from .logger import get_caller_name, get_root_logger, log_img_scale
from .misc import find_latest_checkpoint, update_data_root
from .setup_env import setup_multi_processes
from .optimizer import DistOptimizerHook

__all__ = [
    'get_root_logger', 'collect_env', 'find_latest_checkpoint',
    'update_data_root', 'setup_multi_processes', 'get_caller_name',
    'log_img_scale', 'DistOptimizerHook'
]
training/Detection/mmdet/utils/optimizer.py
ADDED
@@ -0,0 +1,33 @@
from mmcv.runner import OptimizerHook, HOOKS
try:
    import apex
except ImportError:
    print('apex is not installed')


@HOOKS.register_module()
class DistOptimizerHook(OptimizerHook):
    """Optimizer hook for distributed training with gradient accumulation."""

    def __init__(self, update_interval=1, grad_clip=None, coalesce=True, bucket_size_mb=-1, use_fp16=False):
        self.grad_clip = grad_clip
        self.coalesce = coalesce
        self.bucket_size_mb = bucket_size_mb
        self.update_interval = update_interval
        self.use_fp16 = use_fp16

    def before_run(self, runner):
        runner.optimizer.zero_grad()

    def after_train_iter(self, runner):
        # Pre-divide the loss so gradients accumulated over
        # `update_interval` iterations average rather than sum.
        runner.outputs['loss'] /= self.update_interval
        if self.use_fp16:
            with apex.amp.scale_loss(runner.outputs['loss'], runner.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            runner.outputs['loss'].backward()
        if self.every_n_iters(runner, self.update_interval):
            if self.grad_clip is not None:
                self.clip_grads(runner.model.parameters())
            runner.optimizer.step()
            runner.optimizer.zero_grad()
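A usage note (not from the commit itself): this hook steps the optimizer only every `update_interval` iterations, so the effective batch size is multiplied by that interval. A hedged sketch of how it might be enabled from an mmdet config, assuming the standard `optimizer_config` hook slot:

```python
# Hypothetical mmdet config fragment (an assumption, not part of this commit):
# replaces the default OptimizerHook with the registered DistOptimizerHook.
# Effective batch size becomes per_gpu_batch * num_gpus * update_interval.
optimizer_config = dict(
    type='DistOptimizerHook',
    update_interval=2,             # accumulate gradients over 2 iterations
    grad_clip=dict(max_norm=5.0),  # forwarded to OptimizerHook.clip_grads
    use_fp16=False,                # True requires NVIDIA apex to be installed
)
```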
training/INSTRUCTIONS.md
ADDED
@@ -0,0 +1,158 @@
# Installation, Training and Evaluation Instructions for Image Classification

We provide installation, training and evaluation instructions for image classification here.

## Installation Instructions

- Clone this repo:

```bash
git clone https://github.com/megvii-research/RevCol.git
cd RevCol
```

- Create a conda virtual environment and activate it:

```bash
conda create --name revcol python=3.7 -y
conda activate revcol
```

- Install `CUDA>=11.3` with `cudnn>=8` following
the [official installation instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
- Install `PyTorch>=1.11.0` and `torchvision>=0.12.0` with `CUDA>=11.3`:

```bash
conda install pytorch=1.11.0 torchvision=0.12.0 torchaudio=0.11.0 cudatoolkit=11.3 -c pytorch
```

- Install `timm==0.5.4`:

```bash
pip install timm==0.5.4
```

- Install other requirements:

```bash
pip install -r requirements.txt
```

## Data preparation

We use the standard ImageNet dataset, which you can download from http://image-net.org/. We provide the following two ways to load data:

- For the standard ImageNet-1k dataset, the file structure should look like:
```bash
path-to-imagenet-1k
├── train
│   ├── class1
│   │   ├── img1.jpeg
│   │   ├── img2.jpeg
│   │   └── ...
│   ├── class2
│   │   ├── img3.jpeg
│   │   └── ...
│   └── ...
└── val
    ├── class1
    │   ├── img4.jpeg
    │   ├── img5.jpeg
    │   └── ...
    ├── class2
    │   ├── img6.jpeg
    │   └── ...
    └── ...
```

- For the ImageNet-22K dataset, the file structure should look like:

```bash
path-to-imagenet-22k
├── class1
│   ├── img1.jpeg
│   ├── img2.jpeg
│   └── ...
├── class2
│   ├── img3.jpeg
│   └── ...
└── ...
```

- As ImageNet-22k has no val set, one way is to use the ImageNet-1k val set as the evaluation set for ImageNet-22k models. Remember to map the ImageNet-1k labels to their ImageNet-22k indices (a sketch of one way to build this mapping follows this file):
```bash
path-to-imagenet-22k-custom-eval-set
├── class1
│   ├── img1.jpeg
│   ├── img2.jpeg
│   └── ...
├── class2
│   ├── img3.jpeg
│   └── ...
└── ...
```

## Evaluation

To evaluate a pre-trained `RevCol` on the ImageNet validation set, run:

```bash
torchrun --nproc_per_node=<num-of-gpus-to-use> --master_port=23456 main.py --cfg <config-file.yaml> --resume <checkpoint_path> --data-path <imagenet-path> --eval
```

For example, to evaluate `RevCol-T` with 8 GPUs:

```bash
torchrun --nproc_per_node=8 --master_port=23456 main.py --cfg configs/revcol_tiny_1k.yaml --resume path_to_your_model.pth --data-path <imagenet-path> --eval
```

## Training from scratch on ImageNet-1K

To train a `RevCol` on ImageNet from scratch, run:

```bash
torchrun --nproc_per_node=<num-of-gpus-to-use> --master_port=23456 main.py \
--cfg <config-file> --data-path <imagenet-path> [--batch-size <batch-size-per-gpu> --output <output-directory> --tag <job-tag>]
```

For example, to train `RevCol` with 8 GPUs on a single node for 300 epochs, run:

`RevCol-T`:

```bash
torchrun --nproc_per_node=8 --master_port=23456 main.py --cfg configs/revcol_tiny_1k.yaml --batch-size 128 --data-path <imagenet-path>
```

`RevCol-S`:

```bash
torchrun --nproc_per_node=8 --master_port=23456 main.py --cfg configs/revcol_small_1k.yaml --batch-size 128 --data-path <imagenet-path>
```

`RevCol-B`:

```bash
torchrun --nproc_per_node=8 --master_port=23456 main.py --cfg configs/revcol_base_1k.yaml --batch-size 128 --data-path <imagenet-path>
```

## Pre-training on ImageNet-22K

For example, to pre-train a `RevCol-L` model on ImageNet-22K:

```bash
torchrun --nproc_per_node=8 --master_port=23456 main.py --cfg configs/revcol_large_22k_pretrain.yaml --batch-size 128 --data-path <imagenet-22k-path> --opt DATA.EVAL_DATA_PATH <imagenet-22k-custom-eval-path>
```


## Fine-tuning from an ImageNet-22K(21K) pre-trained model

For example, to fine-tune a `RevCol-B` model pre-trained on ImageNet-22K(21K):

```bash
torchrun --nproc_per_node=8 --master_port=23456 main.py --cfg configs/revcol_base_1k_384_finetune.yaml --batch-size 64 --data-path <imagenet-1k-path> --finetune revcol_base_22k_pretrained.pth
```
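Regarding the Data preparation note above about mapping ImageNet-1k labels to ImageNet-22k for the custom eval set: since both datasets typically name class folders by WordNet synset ID (e.g. `n01440764`), and `torchvision.datasets.ImageFolder` assigns labels from sorted folder names, one hedged way to build the mapping is by matching synset IDs. A sketch under that folder-layout assumption (this helper is not part of the commit):

```python
# Hypothetical helper (not in the commit): map ImageNet-1k class indices to
# ImageNet-22k indices by matching WordNet synset folder names.
import os

def build_1k_to_22k_label_map(imagenet1k_val_dir, imagenet22k_dir):
    # ImageFolder sorts class folder names, so sorted order == label order.
    classes_1k = sorted(os.listdir(imagenet1k_val_dir))
    classes_22k = sorted(os.listdir(imagenet22k_dir))
    idx_22k = {wnid: i for i, wnid in enumerate(classes_22k)}
    # For every 1k label, look up the index of the same synset in 22k.
    return [idx_22k[wnid] for wnid in classes_1k]
```

With such a map, predictions in the 21841-way output space can be compared against remapped ImageNet-1k val labels.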
training/LICENSE
ADDED
@@ -0,0 +1,190 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   Copyright (c) 2022 Megvii Inc. All rights reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
training/README.md
ADDED
@@ -0,0 +1,79 @@
# Reversible Column Networks
This repo is the official implementation of:

### [Reversible Column Networks](https://arxiv.org/abs/2212.11696)
[Yuxuan Cai](https://nightsnack.github.io), [Yizhuang Zhou](https://scholar.google.com/citations?user=VRSGDDEAAAAJ), [Qi Han](https://hanqer.github.io), Jianjian Sun, Xiangwen Kong, Jun Li, [Xiangyu Zhang](https://scholar.google.com/citations?user=yuB-cfoAAAAJ) \
[MEGVII Technology](https://en.megvii.com)\
International Conference on Learning Representations (ICLR) 2023\
[\[arxiv\]](https://arxiv.org/abs/2212.11696)



## Updates
***2/10/2023***\
RevCol model weights released.

***1/21/2023***\
RevCol was accepted by ICLR 2023!

***12/23/2022***\
Initial commits: code for ImageNet-1k and ImageNet-22k classification is released.


## To Do List


- [x] ImageNet-1K and 22k Training Code
- [x] ImageNet-1K and 22k Model Weights
- [ ] Cascade Mask R-CNN COCO Object Detection Code & Model Weights
- [ ] ADE20k Semantic Segmentation Code & Model Weights


## Introduction
RevCol is composed of multiple copies of subnetworks, named columns, between which multi-level reversible connections are employed. RevCol can serve as a foundation-model backbone for various tasks in computer vision, including classification, detection and segmentation.

<p align="center">
<img src="figures/title.png" width=100% height=100%
class="center">
</p>

## Main Results on ImageNet with Pre-trained Models

| name | pretrain | resolution | #params | FLOPs | acc@1 | pretrained model | finetuned model |
|:---------------------:| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| RevCol-T | ImageNet-1K | 224x224 | 30M | 4.5G | 82.2 | [baidu](https://pan.baidu.com/s/1iGsbdmFcDpwviCHaajeUnA?pwd=h4tj)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_tiny_1k.pth) | - |
| RevCol-S | ImageNet-1K | 224x224 | 60M | 9.0G | 83.5 | [baidu](https://pan.baidu.com/s/1hpHfdFrTZIPB5NTwqDMLag?pwd=mxuk)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_small_1k.pth) | - |
| RevCol-B | ImageNet-1K | 224x224 | 138M | 16.6G | 84.1 | [baidu](https://pan.baidu.com/s/16XIJ1n8pXPD2cXwnFX6b9w?pwd=j6x9)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_base_1k.pth) | - |
| RevCol-B<sup>\*</sup> | ImageNet-22K | 224x224 | 138M | 16.6G | 85.6 |[baidu](https://pan.baidu.com/s/1l8zOFifgC8fZtBpHK2ZQHg?pwd=rh58)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_base_22k.pth)| [baidu](https://pan.baidu.com/s/1HqhDXL6OIQdn1LeM2pewYQ?pwd=1bp3)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_base_22k_1kft_224.pth)|
| RevCol-B<sup>\*</sup> | ImageNet-22K | 384x384 | 138M | 48.9G | 86.7 |[baidu](https://pan.baidu.com/s/1l8zOFifgC8fZtBpHK2ZQHg?pwd=rh58)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_base_22k.pth)| [baidu](https://pan.baidu.com/s/18G0zAUygKgu58s2AjCBpsw?pwd=rv86)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_base_22k_1kft_384.pth)|
| RevCol-L<sup>\*</sup> | ImageNet-22K | 224x224 | 273M | 39G | 86.6 |[baidu](https://pan.baidu.com/s/1ueKqh3lFAAgC-vVU34ChYA?pwd=qv5m)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_large_22k.pth)| [baidu](https://pan.baidu.com/s/1CsWmcPcwieMzXE8pVmHh7w?pwd=qd9n)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_large_22k_1kft_224.pth)|
| RevCol-L<sup>\*</sup> | ImageNet-22K | 384x384 | 273M | 116G | 87.6 |[baidu](https://pan.baidu.com/s/1ueKqh3lFAAgC-vVU34ChYA?pwd=qv5m)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_large_22k.pth)| [baidu](https://pan.baidu.com/s/1VmCE3W3Xw6-Lo4rWrj9Xzg?pwd=x69r)/[github](https://github.com/megvii-research/RevCol/releases/download/checkpoint/revcol_large_22k_1kft_384.pth)|

## Getting Started
Please refer to [INSTRUCTIONS.md](INSTRUCTIONS.md) for setup, training and evaluation details.


## Acknowledgement
This repo was inspired by several open-source projects. We are grateful for these excellent projects and list them as follows:
- [timm](https://github.com/rwightman/pytorch-image-models)
- [Swin Transformer](https://github.com/microsoft/Swin-Transformer)
- [ConvNeXt](https://github.com/facebookresearch/ConvNeXt)
- [beit](https://github.com/microsoft/unilm/tree/master/beit)

## License
RevCol is released under the [Apache 2.0 license](LICENSE).

## Contact Us
If you have any questions about this repo or the original paper, please contact Yuxuan at [email protected].


## Citation
```
@inproceedings{cai2022reversible,
  title={Reversible Column Networks},
  author={Cai, Yuxuan and Zhou, Yizhuang and Han, Qi and Sun, Jianjian and Kong, Xiangwen and Li, Jun and Zhang, Xiangyu},
  booktitle={International Conference on Learning Representations},
  year={2023},
  url={https://openreview.net/forum?id=Oc2vlWU0jFY}
}
```
training/config.py
ADDED
@@ -0,0 +1,243 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------
import os
import yaml
from yacs.config import CfgNode as CN

_C = CN()

# Base config files
_C.BASE = ['']

# -----------------------------------------------------------------------------
# Data settings
# -----------------------------------------------------------------------------
_C.DATA = CN()
# Batch size for a single GPU, could be overwritten by command line argument
_C.DATA.BATCH_SIZE = 128
# Path to dataset, could be overwritten by command line argument
_C.DATA.DATA_PATH = 'path/to/imagenet'
# Dataset name
_C.DATA.DATASET = 'imagenet'
# Input image size
_C.DATA.IMG_SIZE = 224
# Interpolation to resize image (random, bilinear, bicubic)
_C.DATA.INTERPOLATION = 'bicubic'
# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.
_C.DATA.PIN_MEMORY = True
# Number of data loading threads
_C.DATA.NUM_WORKERS = 8
# Path to evaluation dataset for ImageNet 22k
_C.DATA.EVAL_DATA_PATH = 'path/to/eval/data'

# -----------------------------------------------------------------------------
# Model settings
# -----------------------------------------------------------------------------
_C.MODEL = CN()
# Model type
_C.MODEL.TYPE = ''
# Model name
_C.MODEL.NAME = ''
# Checkpoint to resume, could be overwritten by command line argument
_C.MODEL.RESUME = ''
# Checkpoint to finetune, could be overwritten by command line argument
_C.MODEL.FINETUNE = ''
# Number of classes, overwritten in data preparation
_C.MODEL.NUM_CLASSES = 1000
# Label Smoothing
_C.MODEL.LABEL_SMOOTHING = 0.0

# -----------------------------------------------------------------------------
# Specific Model settings
# -----------------------------------------------------------------------------

_C.REVCOL = CN()

_C.REVCOL.INTER_SUPV = True

_C.REVCOL.SAVEMM = True

_C.REVCOL.FCOE = 4.0

_C.REVCOL.CCOE = 0.8

_C.REVCOL.KERNEL_SIZE = 3

_C.REVCOL.DROP_PATH = 0.1

_C.REVCOL.HEAD_INIT_SCALE = None

# -----------------------------------------------------------------------------
# Training settings
# -----------------------------------------------------------------------------
_C.TRAIN = CN()
_C.TRAIN.START_EPOCH = 0
_C.TRAIN.EPOCHS = 300
_C.TRAIN.WARMUP_EPOCHS = 5
_C.TRAIN.WEIGHT_DECAY = 4e-5
_C.TRAIN.BASE_LR = 0.4

_C.TRAIN.WARMUP_LR = 0.05
_C.TRAIN.MIN_LR = 1e-5
# Clip gradient norm
_C.TRAIN.CLIP_GRAD = 10.0
# Auto resume from latest checkpoint
_C.TRAIN.AUTO_RESUME = True
# Whether to use gradient checkpointing to save memory
_C.TRAIN.USE_CHECKPOINT = False

_C.TRAIN.AMP = True

# LR scheduler
_C.TRAIN.LR_SCHEDULER = CN()
_C.TRAIN.LR_SCHEDULER.NAME = 'cosine'
# Epoch interval to decay LR, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30
# LR decay rate, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1

# Optimizer
_C.TRAIN.OPTIMIZER = CN()
_C.TRAIN.OPTIMIZER.NAME = 'sgd'
# Optimizer epsilon for adamw
_C.TRAIN.OPTIMIZER.EPS = 1e-8
# Optimizer betas for adamw
_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999)
# SGD momentum
_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9
# Layer Decay
_C.TRAIN.OPTIMIZER.LAYER_DECAY = 1.0

# -----------------------------------------------------------------------------
# Augmentation settings
# -----------------------------------------------------------------------------
_C.AUG = CN()
# Color jitter factor
_C.AUG.COLOR_JITTER = 0.4
# Use AutoAugment policy. "v0" or "original"
_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1'
# Random erase prob
_C.AUG.REPROB = 0.25
# Random erase mode
_C.AUG.REMODE = 'pixel'
# Random erase count
_C.AUG.RECOUNT = 1
# Mixup alpha, mixup enabled if > 0
_C.AUG.MIXUP = 0.8
# Cutmix alpha, cutmix enabled if > 0
_C.AUG.CUTMIX = 1.0
# Cutmix min/max ratio, overrides alpha and enables cutmix if set
_C.AUG.CUTMIX_MINMAX = None
# Probability of performing mixup or cutmix when either/both is enabled
_C.AUG.MIXUP_PROB = 1.0
# Probability of switching to cutmix when both mixup and cutmix enabled
_C.AUG.MIXUP_SWITCH_PROB = 0.5
# How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
_C.AUG.MIXUP_MODE = 'batch'

# -----------------------------------------------------------------------------
# Testing settings
# -----------------------------------------------------------------------------
_C.TEST = CN()
# Whether to use center crop when testing
_C.TEST.CROP = True

# -----------------------------------------------------------------------------
# Misc
# -----------------------------------------------------------------------------
# Path to output folder, overwritten by command line argument
_C.OUTPUT = 'outputs/'
# Tag of experiment, overwritten by command line argument
_C.TAG = 'default'
# Frequency to save checkpoint
_C.SAVE_FREQ = 1
# Frequency to log info
_C.PRINT_FREQ = 100
# Fixed random seed
_C.SEED = 0
# Perform evaluation only, overwritten by command line argument
_C.EVAL_MODE = False
# Test throughput only, overwritten by command line argument
_C.THROUGHPUT_MODE = False
# Local rank for DistributedDataParallel, given by command line argument
_C.LOCAL_RANK = 0



# EMA
_C.MODEL_EMA = False
_C.MODEL_EMA_DECAY = 0.9999




# Machine
_C.MACHINE = CN()
_C.MACHINE.MACHINE_WORLD_SIZE = None
_C.MACHINE.MACHINE_RANK = None

def _update_config_from_file(config, cfg_file):
    config.defrost()
    with open(cfg_file, 'r') as f:
        yaml_cfg = yaml.load(f, Loader=yaml.FullLoader)

    # Recursively merge any parent configs listed under BASE first,
    # so that keys in this file override the inherited ones.
    for cfg in yaml_cfg.setdefault('BASE', ['']):
        if cfg:
            _update_config_from_file(
                config, os.path.join(os.path.dirname(cfg_file), cfg)
            )
    print('=> merge config from {}'.format(cfg_file))
    config.merge_from_file(cfg_file)
    config.freeze()


def update_config(config, args):
    _update_config_from_file(config, args.cfg)

    config.defrost()
    if args.opts:
        config.merge_from_list(args.opts)

    # merge from specific arguments
    if args.batch_size:
        config.DATA.BATCH_SIZE = args.batch_size
    if args.data_path:
        config.DATA.DATA_PATH = args.data_path
    if args.resume:
        config.MODEL.RESUME = args.resume
    if args.finetune:
        config.MODEL.FINETUNE = args.finetune
    if args.use_checkpoint:
        config.TRAIN.USE_CHECKPOINT = True
    if args.output:
        config.OUTPUT = args.output
    if args.tag:
        config.TAG = args.tag
    if args.eval:
        config.EVAL_MODE = True
    if args.model_ema:
        config.MODEL_EMA = True

    config.dist_url = args.dist_url
    # set local rank for distributed training
    config.LOCAL_RANK = args.local_rank

    # output folder
    config.OUTPUT = os.path.join(config.OUTPUT, config.MODEL.NAME, config.TAG)

    config.freeze()


def get_config(args):
    """Get a yacs CfgNode object with default values."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    config = _C.clone()
    update_config(config, args)

    return config
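A note on `_update_config_from_file` above (not part of the commit): the `BASE` field lets a YAML config inherit from other YAMLs, and parents are merged first, recursively, so keys in the child file win. A minimal hypothetical sketch, assuming it is run in the context of config.py (the file names are made up):

```python
# Hypothetical illustration of BASE inheritance (not in the commit).
import yaml

with open('base.yaml', 'w') as f:
    yaml.dump({'TRAIN': {'EPOCHS': 300, 'BASE_LR': 1e-3}}, f)
with open('child.yaml', 'w') as f:
    yaml.dump({'BASE': ['base.yaml'], 'TRAIN': {'BASE_LR': 5e-4}}, f)

cfg = _C.clone()                           # defaults from config.py above
_update_config_from_file(cfg, 'child.yaml')
assert cfg.TRAIN.EPOCHS == 300             # inherited from base.yaml
assert cfg.TRAIN.BASE_LR == 5e-4           # overridden by child.yaml
```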
training/configs/revcol_base_1k.yaml
ADDED
@@ -0,0 +1,48 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: True
DATA:
  IMG_SIZE: 224
  NUM_WORKERS: 6
MODEL:
  TYPE: revcol_base
  NAME: revcol_base
  LABEL_SMOOTHING: 0.1
REVCOL:
  INTER_SUPV: True
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.4
TRAIN:
  EPOCHS: 300
  BASE_LR: 1e-3
  WARMUP_EPOCHS: 20
  WEIGHT_DECAY: 0.05
  WARMUP_LR: 1e-5
  MIN_LR: 1e-8
  OPTIMIZER:
    NAME: 'adamw'
  CLIP_GRAD: 5.0
AUG:
  COLOR_JITTER: 0.0
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.8
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 1.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 1.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.5
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_base_1k_224_finetune.yaml
ADDED
@@ -0,0 +1,50 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: False
DATA:
  IMG_SIZE: 224
  DATASET: imagenet
MODEL:
  TYPE: revcol_base
  NAME: revcol_base_1k_Finetune_224
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 1000
REVCOL:
  INTER_SUPV: False
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.2
  HEAD_INIT_SCALE: 0.001
TRAIN:
  EPOCHS: 30
  BASE_LR: 1e-4
  WARMUP_EPOCHS: 0
  WEIGHT_DECAY: 1e-8
  WARMUP_LR: 4e-6
  MIN_LR: 2e-7
  OPTIMIZER:
    NAME: 'adamw'
    LAYER_DECAY: 0.9
AUG:
  COLOR_JITTER: 0.0
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.0
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 0.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 0.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.0
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_base_1k_384_finetune.yaml
ADDED
@@ -0,0 +1,50 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: False
DATA:
  IMG_SIZE: 384
  DATASET: imagenet
MODEL:
  TYPE: revcol_base
  NAME: revcol_base_1k_Finetune_384
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 1000
REVCOL:
  INTER_SUPV: False
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.2
  HEAD_INIT_SCALE: 0.001
TRAIN:
  EPOCHS: 30
  BASE_LR: 1e-4
  WARMUP_EPOCHS: 0
  WEIGHT_DECAY: 1e-8
  WARMUP_LR: 4e-6
  MIN_LR: 2e-7
  OPTIMIZER:
    NAME: 'adamw'
    LAYER_DECAY: 0.9
AUG:
  COLOR_JITTER: 0.0
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.0
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 0.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 0.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.0
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_base_22k_pretrain.yaml
ADDED
@@ -0,0 +1,51 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: False
DATA:
  IMG_SIZE: 224
  DATASET: imagenet22k
  NUM_WORKERS: 6
MODEL:
  TYPE: revcol_base
  NAME: revcol_base_22k_Pretrain
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 21841
REVCOL:
  INTER_SUPV: True
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.3
TRAIN:
  EPOCHS: 90
  BASE_LR: 1.25e-4
  WARMUP_EPOCHS: 5
  WEIGHT_DECAY: 0.1
  WARMUP_LR: 1e-5
  MIN_LR: 1e-7
  OPTIMIZER:
    NAME: 'adamw'
# BN:
#   USE_PRECISE_STATS: True
AUG:
  COLOR_JITTER: 0.4
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.8
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 1.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 1.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.5
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_large_1k_224_finetune.yaml
ADDED
@@ -0,0 +1,51 @@
PRINT_FREQ: 20
SAVE_FREQ: 1
MODEL_EMA: False
DATA:
  IMG_SIZE: 224
  DATASET: imagenet
MODEL:
  TYPE: revcol_large
  NAME: revcol_large_1k_Finetune_224
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 1000
REVCOL:
  INTER_SUPV: False
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.3
  HEAD_INIT_SCALE: 0.001
TRAIN:
  EPOCHS: 30
  BASE_LR: 5e-5
  WARMUP_EPOCHS: 0
  WEIGHT_DECAY: 1e-8
  WARMUP_LR: 4e-6
  MIN_LR: 2e-7
  OPTIMIZER:
    NAME: 'adamw'
    LAYER_DECAY: 0.8

AUG:
  COLOR_JITTER: 0.0
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.0
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 0.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 0.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.0
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_large_1k_384_finetune.yaml
ADDED
@@ -0,0 +1,51 @@
PRINT_FREQ: 20
SAVE_FREQ: 1
MODEL_EMA: False
DATA:
  IMG_SIZE: 384
  DATASET: imagenet
MODEL:
  TYPE: revcol_large
  NAME: revcol_large_1k_Finetune_384
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 1000
REVCOL:
  INTER_SUPV: False
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.3
  HEAD_INIT_SCALE: 0.001
TRAIN:
  EPOCHS: 30
  BASE_LR: 5e-5
  WARMUP_EPOCHS: 0
  WEIGHT_DECAY: 1e-8
  WARMUP_LR: 4e-6
  MIN_LR: 2e-7
  OPTIMIZER:
    NAME: 'adamw'
    LAYER_DECAY: 0.8

AUG:
  COLOR_JITTER: 0.0
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.0
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 0.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 0.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.0
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_large_22k_pretrain.yaml
ADDED
@@ -0,0 +1,50 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: False
DATA:
  IMG_SIZE: 224
  DATASET: imagenet22k
  NUM_WORKERS: 6
MODEL:
  TYPE: revcol_large
  NAME: revcol_large_22k_Pretrain
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 21841
REVCOL:
  INTER_SUPV: True
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.3
TRAIN:
  EPOCHS: 90
  BASE_LR: 1.25e-4
  WARMUP_EPOCHS: 5
  WEIGHT_DECAY: 0.1
  WARMUP_LR: 1e-5
  MIN_LR: 1e-7
  OPTIMIZER:
    NAME: 'adamw'

AUG:
  COLOR_JITTER: 0.4
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.8
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 1.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 1.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.5
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_small_1k.yaml
ADDED
@@ -0,0 +1,48 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: True
DATA:
  IMG_SIZE: 224
  NUM_WORKERS: 6
MODEL:
  TYPE: revcol_small
  NAME: revcol_small
  LABEL_SMOOTHING: 0.1
REVCOL:
  INTER_SUPV: True
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.3
TRAIN:
  EPOCHS: 300
  BASE_LR: 1e-3
  WARMUP_EPOCHS: 20
  WEIGHT_DECAY: 0.05
  WARMUP_LR: 1e-5
  MIN_LR: 1e-6
  OPTIMIZER:
    NAME: 'adamw'
  CLIP_GRAD: 0.0
AUG:
  COLOR_JITTER: 0.4
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.8
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 1.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 1.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.5
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_tiny_1k.yaml
ADDED
@@ -0,0 +1,48 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: True
DATA:
  IMG_SIZE: 224
  NUM_WORKERS: 6
MODEL:
  TYPE: revcol_tiny
  NAME: revcol_tiny
  LABEL_SMOOTHING: 0.1
REVCOL:
  INTER_SUPV: True
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.1
TRAIN:
  EPOCHS: 300
  BASE_LR: 1e-3
  WARMUP_EPOCHS: 20
  WEIGHT_DECAY: 0.05
  WARMUP_LR: 1e-5
  MIN_LR: 1e-6
  OPTIMIZER:
    NAME: 'adamw'
  CLIP_GRAD: 0.0
AUG:
  COLOR_JITTER: 0.4
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.8
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 1.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 1.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.5
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_xlarge_1k_384_finetune.yaml
ADDED
@@ -0,0 +1,53 @@
PRINT_FREQ: 30
SAVE_FREQ: 1
MODEL_EMA: True
DATA:
  IMG_SIZE: 384
  DATASET: imagenet
  PIPE_NAME: 'dpflow.silvia.imagenet.train.rand-re-jitt.384'
  NUM_WORKERS: 8
MODEL:
  TYPE: revcol_xlarge
  NAME: revcol_xlarge_1k_Finetune
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 1000
REVCOL:
  INTER_SUPV: False
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.4
  HEAD_INIT_SCALE: 0.001
TRAIN:
  EPOCHS: 30
  BASE_LR: 2e-5
  WARMUP_EPOCHS: 0
  WEIGHT_DECAY: 1e-8
  WARMUP_LR: 4e-6
  MIN_LR: 2e-7
  OPTIMIZER:
    NAME: 'adamw'
    LAYER_DECAY: 0.8

AUG:
  COLOR_JITTER: 0.0
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.0
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 0.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 0.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.0
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/configs/revcol_xlarge_22k_pretrain.yaml
ADDED
@@ -0,0 +1,50 @@
PRINT_FREQ: 100
SAVE_FREQ: 1
MODEL_EMA: False
DATA:
  IMG_SIZE: 224
  DATASET: imagenet22k
  NUM_WORKERS: 8
MODEL:
  TYPE: revcol_xlarge
  NAME: revcol_xlarge_22k_Pretrain
  LABEL_SMOOTHING: 0.1
  NUM_CLASSES: 21841
REVCOL:
  INTER_SUPV: True
  SAVEMM: True
  FCOE: 3.0
  CCOE: 0.7
  DROP_PATH: 0.4
TRAIN:
  EPOCHS: 90
  BASE_LR: 1.25e-4
  WARMUP_EPOCHS: 5
  WEIGHT_DECAY: 0.1
  WARMUP_LR: 1e-5
  MIN_LR: 1e-7
  OPTIMIZER:
    NAME: 'adamw'

AUG:
  COLOR_JITTER: 0.0
  # Use AutoAugment policy. "v0" or "original"
  AUTO_AUGMENT: 'rand-m9-mstd0.5-inc1'
  # Random erase prob
  REPROB: 0.25
  # Random erase mode
  REMODE: 'pixel'
  # Random erase count
  RECOUNT: 1
  # Mixup alpha, mixup enabled if > 0
  MIXUP: 0.8
  # Cutmix alpha, cutmix enabled if > 0
  CUTMIX: 1.0
  # Cutmix min/max ratio, overrides alpha and enables cutmix if set
  CUTMIX_MINMAX: None
  # Probability of performing mixup or cutmix when either/both is enabled
  MIXUP_PROB: 1.0
  # Probability of switching to cutmix when both mixup and cutmix enabled
  MIXUP_SWITCH_PROB: 0.5
  # How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
  MIXUP_MODE: 'batch'
training/data/__init__.py
ADDED
@@ -0,0 +1 @@
from .build_data import build_loader
training/data/build_data.py
ADDED
@@ -0,0 +1,137 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import queue
from typing import Dict, Sequence
import warnings
import os
import torch
import numpy as np
import torch.distributed as dist
from torchvision import datasets, transforms
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.data import Mixup
from timm.data import create_transform


from .samplers import SubsetRandomSampler

def build_loader(config):

    config.defrost()
    dataset_train, _ = build_dataset(is_train=True, config=config)
    config.freeze()
    print(f"global rank {dist.get_rank()} successfully built train dataset")


    sampler_train = torch.utils.data.DistributedSampler(
        dataset_train, shuffle=True
    )

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=config.DATA.BATCH_SIZE,
        num_workers=config.DATA.NUM_WORKERS,
        pin_memory=config.DATA.PIN_MEMORY,
        drop_last=True,
        persistent_workers=True
    )

    # ----------------------------------- Val Dataset -----------------------------------

    dataset_val, _ = build_dataset(is_train=False, config=config)
    print(f"global rank {dist.get_rank()} successfully built val dataset")

    # Shard the val set across ranks: rank r takes every world_size-th index.
    indices = np.arange(dist.get_rank(), len(dataset_val), dist.get_world_size())
    sampler_val = SubsetRandomSampler(indices)

    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, sampler=sampler_val,
        batch_size=config.DATA.BATCH_SIZE,
        shuffle=False,
        num_workers=config.DATA.NUM_WORKERS,
        pin_memory=config.DATA.PIN_MEMORY,
        drop_last=False,
        persistent_workers=True
    )

    # setup mixup / cutmix
    mixup_fn = None
    mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. or config.AUG.CUTMIX_MINMAX is not None
    if mixup_active:
        mixup_fn = Mixup(
            mixup_alpha=config.AUG.MIXUP, cutmix_alpha=config.AUG.CUTMIX, cutmix_minmax=config.AUG.CUTMIX_MINMAX,
            prob=config.AUG.MIXUP_PROB, switch_prob=config.AUG.MIXUP_SWITCH_PROB, mode=config.AUG.MIXUP_MODE,
            label_smoothing=config.MODEL.LABEL_SMOOTHING, num_classes=config.MODEL.NUM_CLASSES)

    return dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn


def build_dataset(is_train, config):
    transform = build_transform(is_train, config)
    if config.DATA.DATASET == 'imagenet':
        prefix = 'train' if is_train else 'val'
        root = os.path.join(config.DATA.DATA_PATH, prefix)
        dataset = datasets.ImageFolder(root, transform=transform)
        nb_classes = 1000
    elif config.DATA.DATASET == 'imagenet22k':  # matches DATASET in the 22k configs
        if is_train:
            root = config.DATA.DATA_PATH
        else:
            root = config.DATA.EVAL_DATA_PATH
        dataset = datasets.ImageFolder(root, transform=transform)
        nb_classes = 21841
    else:
        raise NotImplementedError("We only support ImageNet now.")

    return dataset, nb_classes


def build_transform(is_train, config):
    resize_im = config.DATA.IMG_SIZE > 32
    if is_train:
        # this should always dispatch to transforms_imagenet_train
        transform = create_transform(
            input_size=config.DATA.IMG_SIZE,
            is_training=True,
|
101 |
+
color_jitter=config.AUG.COLOR_JITTER if config.AUG.COLOR_JITTER > 0 else None,
|
102 |
+
auto_augment=config.AUG.AUTO_AUGMENT if config.AUG.AUTO_AUGMENT != 'none' else None,
|
103 |
+
re_prob=config.AUG.REPROB,
|
104 |
+
re_mode=config.AUG.REMODE,
|
105 |
+
re_count=config.AUG.RECOUNT,
|
106 |
+
interpolation=config.DATA.INTERPOLATION,
|
107 |
+
)
|
108 |
+
if not resize_im:
|
109 |
+
# replace RandomResizedCropAndInterpolation with
|
110 |
+
# RandomCrop
|
111 |
+
transform.transforms[0] = transforms.RandomCrop(config.DATA.IMG_SIZE, padding=4)
|
112 |
+
return transform
|
113 |
+
|
114 |
+
t = []
|
115 |
+
if resize_im:
|
116 |
+
if config.DATA.IMG_SIZE > 224:
|
117 |
+
t.append(
|
118 |
+
transforms.Resize((config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
|
119 |
+
interpolation=transforms.InterpolationMode.BICUBIC),
|
120 |
+
)
|
121 |
+
print(f"Warping {config.DATA.IMG_SIZE} size input images...")
|
122 |
+
elif config.TEST.CROP:
|
123 |
+
size = int((256 / 224) * config.DATA.IMG_SIZE)
|
124 |
+
t.append(
|
125 |
+
transforms.Resize(size, interpolation=transforms.InterpolationMode.BICUBIC),
|
126 |
+
# to maintain same ratio w.r.t. 224 images
|
127 |
+
)
|
128 |
+
t.append(transforms.CenterCrop(config.DATA.IMG_SIZE))
|
129 |
+
else:
|
130 |
+
t.append(
|
131 |
+
transforms.Resize((config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
|
132 |
+
interpolation=transforms.InterpolationMode.BICUBIC)
|
133 |
+
)
|
134 |
+
|
135 |
+
t.append(transforms.ToTensor())
|
136 |
+
t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD))
|
137 |
+
return transforms.Compose(t)
|
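build_loader assumes an initialized torch.distributed process group; a minimal single-process sketch for local testing (the env-var setup is an assumption for a local run, not part of the repo):

# Sketch: initialize a one-process group so build_loader's dist.get_rank() calls work.
import os
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group(backend='gloo', rank=0, world_size=1)

from data import build_loader
# with a yacs `config` from config.get_config(args) (see main.py below):
# dataset_train, dataset_val, loader_train, loader_val, mixup_fn = build_loader(config)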
training/data/samplers.py
ADDED
@@ -0,0 +1,29 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import torch


class SubsetRandomSampler(torch.utils.data.Sampler):
    r"""Samples elements randomly from a given list of indices, without replacement.

    Arguments:
        indices (sequence): a sequence of indices
    """

    def __init__(self, indices):
        self.epoch = 0
        self.indices = indices

    def __iter__(self):
        return (self.indices[i] for i in torch.randperm(len(self.indices)))

    def __len__(self):
        return len(self.indices)

    def set_epoch(self, epoch):
        self.epoch = epoch
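build_data.py pairs this sampler with rank-strided indices, so each process evaluates a disjoint shard of the validation set in random order; a quick sketch of the sharding:

# Sketch: rank-strided validation shards for 10 samples across 4 ranks.
import numpy as np

n, world_size = 10, 4
for rank in range(world_size):
    print(rank, np.arange(rank, n, world_size))
# 0 [0 4 8]
# 1 [1 5 9]
# 2 [2 6]
# 3 [3 7]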
training/figures/title.png
ADDED
training/logger.py
ADDED
@@ -0,0 +1,41 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import os
import sys
import logging
import functools
from termcolor import colored


@functools.lru_cache()
def create_logger(output_dir, dist_rank=0, name=''):
    # create logger
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    # create formatters
    fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s'
    color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \
        colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s'

    # create console handler for the master process only
    if dist_rank == 0:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.DEBUG)
        console_handler.setFormatter(
            logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S'))
        logger.addHandler(console_handler)

    # create a per-rank file handler
    file_handler = logging.FileHandler(os.path.join(output_dir, f'log_rank{dist_rank}.txt'), mode='a')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(file_handler)

    return logger
training/loss.py
ADDED
@@ -0,0 +1,35 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import torch
from torch import nn
import torch.distributed as dist
from torch.functional import Tensor
import torch.nn.functional as F


def compound_loss(coe, output_feature, image: Tensor, output_label, targets, criterion_bce, criterion_ce, epoch):
    f_coe, c_coe = coe
    # clamp the (denormalized) target image away from 0/1 for the BCE reconstruction loss
    image.clamp_(0.01, 0.99)
    multi_loss = []
    for i, feature in enumerate(output_feature):
        ratio_f = 1 - i / len(output_feature)    # reconstruction weight decays with column index
        ratio_c = (i + 1) / len(output_label)    # classification weight grows with column index

        ihx = criterion_bce(feature, image) * ratio_f * f_coe
        ihy = criterion_ce(output_label[i], targets) * ratio_c * c_coe
        multi_loss.append(ihx + ihy)
    # the final column's classification loss enters with full weight
    multi_loss.append(criterion_ce(output_label[-1], targets))
    loss = torch.sum(torch.stack(multi_loss), dim=0)
    return loss, multi_loss
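In compound_loss, the BCE reconstruction weight falls and the CE classification weight rises with the column index; a small sketch of the schedule for 3 intermediate features and 4 classifier heads (the shapes produced by revcol.py's intermediate supervision):

# Sketch: per-column loss weights before the f_coe / c_coe multipliers.
n_feat, n_label = 3, 4
for i in range(n_feat):
    ratio_f = 1 - i / n_feat      # 1.00, 0.67, 0.33  (reconstruction)
    ratio_c = (i + 1) / n_label   # 0.25, 0.50, 0.75  (classification)
    print(i, round(ratio_f, 2), ratio_c)
# The last head's CE loss is then appended with full weight 1.0.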
training/lr_scheduler.py
ADDED
@@ -0,0 +1,96 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import math

import torch
from timm.scheduler.step_lr import StepLRScheduler
from timm.scheduler.scheduler import Scheduler


def build_scheduler(config, optimizer=None):
    lr_scheduler = None
    if config.TRAIN.LR_SCHEDULER.NAME == 'cosine':
        lr_scheduler = CosLRScheduler()
    elif config.TRAIN.LR_SCHEDULER.NAME == 'multistep':
        # NOTE: timm's StepLRScheduler actually requires an optimizer and a decay
        # schedule; this branch is not exercised by the provided configs.
        lr_scheduler = StepLRScheduler()
    else:
        raise NotImplementedError(f"Unknown lr scheduler: {config.TRAIN.LR_SCHEDULER.NAME}")

    return lr_scheduler


class CosLRScheduler():
    def __init__(self) -> None:
        pass

    def step_update(self, optimizer, epoch, config):
        """Decay the learning rate with half-cycle cosine after warmup."""
        if epoch < config.TRAIN.WARMUP_EPOCHS:
            lr = (config.TRAIN.BASE_LR - config.TRAIN.WARMUP_LR) * epoch / config.TRAIN.WARMUP_EPOCHS + config.TRAIN.WARMUP_LR
        else:
            lr = config.TRAIN.MIN_LR + (config.TRAIN.BASE_LR - config.TRAIN.MIN_LR) * 0.5 * \
                (1. + math.cos(math.pi * (epoch - config.TRAIN.WARMUP_EPOCHS) / (config.TRAIN.EPOCHS - config.TRAIN.WARMUP_EPOCHS)))
        for param_group in optimizer.param_groups:
            if "lr_scale" in param_group:
                param_group["lr"] = lr * param_group["lr_scale"]
            else:
                param_group["lr"] = lr
        return lr


class LinearLRScheduler(Scheduler):
    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 t_initial: int,
                 lr_min_rate: float,
                 warmup_t=0,
                 warmup_lr_init=0.,
                 t_in_epochs=True,
                 noise_range_t=None,
                 noise_pct=0.67,
                 noise_std=1.0,
                 noise_seed=42,
                 initialize=True,
                 ) -> None:
        super().__init__(
            optimizer, param_group_field="lr",
            noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed,
            initialize=initialize)

        self.t_initial = t_initial
        self.lr_min_rate = lr_min_rate
        self.warmup_t = warmup_t
        self.warmup_lr_init = warmup_lr_init
        self.t_in_epochs = t_in_epochs
        if self.warmup_t:
            self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values]
            super().update_groups(self.warmup_lr_init)
        else:
            self.warmup_steps = [1 for _ in self.base_values]

    def _get_lr(self, t):
        if t < self.warmup_t:
            lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps]
        else:
            t = t - self.warmup_t
            total_t = self.t_initial - self.warmup_t
            lrs = [v - ((v - v * self.lr_min_rate) * (t / total_t)) for v in self.base_values]
        return lrs

    def get_epoch_values(self, epoch: int):
        if self.t_in_epochs:
            return self._get_lr(epoch)
        else:
            return None

    def get_update_values(self, num_updates: int):
        if not self.t_in_epochs:
            return self._get_lr(num_updates)
        else:
            return None
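CosLRScheduler.step_update implements linear warmup followed by half-cycle cosine decay; a standalone numeric sketch using the 22k-pretrain hyperparameters from the config above:

# Sketch: warmup + cosine LR curve (values from revcol_xlarge_22k_pretrain.yaml).
import math

base_lr, warmup_lr, min_lr = 1.25e-4, 1e-5, 1e-7
warmup_epochs, epochs = 5, 90

def lr_at(epoch):
    if epoch < warmup_epochs:
        return (base_lr - warmup_lr) * epoch / warmup_epochs + warmup_lr
    return min_lr + (base_lr - min_lr) * 0.5 * \
        (1. + math.cos(math.pi * (epoch - warmup_epochs) / (epochs - warmup_epochs)))

for e in (0, 5, 45, 90):
    print(e, f"{lr_at(e):.2e}")  # ramps 1e-5 -> 1.25e-4, then decays toward 1e-7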
training/main.py
ADDED
@@ -0,0 +1,422 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import math
import os
import time
import argparse
import datetime
import numpy as np

import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.cuda.amp as amp
from typing import Optional

from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
from timm.utils import accuracy, AverageMeter
from timm.utils import ModelEma as ModelEma
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

from config import get_config
from models import *
from loss import *
from data import build_loader
from lr_scheduler import build_scheduler
from optimizer import build_optimizer
from logger import create_logger
from utils import denormalize, load_checkpoint, load_checkpoint_finetune, save_checkpoint, get_grad_norm, auto_resume_helper, reduce_tensor
from torch.utils.tensorboard import SummaryWriter

scaler = amp.GradScaler()
logger = None


def parse_option():
    parser = argparse.ArgumentParser('RevCol training and evaluation script', add_help=False)
    parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file')
    parser.add_argument(
        "--opts",
        help="Modify config options by adding 'KEY VALUE' pairs. ",
        default=None,
        nargs='+',
    )

    # easy config modification
    parser.add_argument('--batch-size', type=int, default=128, help="batch size for single GPU")
    parser.add_argument('--data-path', type=str, default='data', help='path to dataset')
    parser.add_argument('--resume', help='resume from checkpoint')
    parser.add_argument('--finetune', help='finetune from checkpoint')

    parser.add_argument('--use-checkpoint', action='store_true',
                        help="whether to use gradient checkpointing to save memory")

    parser.add_argument('--output', default='outputs/', type=str, metavar='PATH',
                        help='root of output folder, the full path is <output>/<model_name>/<tag> (default: output)')
    parser.add_argument('--tag', help='tag of experiment')
    parser.add_argument('--eval', action='store_true', help='Perform evaluation only')

    # ema
    parser.add_argument('--model-ema', action='store_true')

    # distributed training
    parser.add_argument("--local_rank", type=int, required=False, help='local rank for DistributedDataParallel')

    parser.add_argument('--dist-url', default='env://', type=str,
                        help='url used to set up distributed training')

    args, unparsed = parser.parse_known_args()
    config = get_config(args)

    return args, config


def main(config):

    config.defrost()
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        rank = int(os.environ["RANK"])
        world_size = int(os.environ['WORLD_SIZE'])
        print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}")
    else:
        # not launched through a distributed launcher; nothing to do
        return

    # linearly scale the learning rates according to total batch size, base batch size 1024
    linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size / 1024.0
    linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * world_size / 1024.0
    linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * world_size / 1024.0

    config.TRAIN.BASE_LR = linear_scaled_lr
    config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr
    config.TRAIN.MIN_LR = linear_scaled_min_lr
    config.freeze()

    dist.init_process_group(
        backend='nccl', init_method=config.dist_url,
        world_size=world_size, rank=rank,
    )
    seed = config.SEED + dist.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # NOTE: assumes single-node training, where the global rank doubles as the CUDA device index
    torch.cuda.set_device(rank)
    global logger

    logger = create_logger(output_dir=config.OUTPUT, dist_rank=dist.get_rank(), name=f"{config.MODEL.NAME}")
    logger.info(config.dump())
    writer = None
    if dist.get_rank() == 0:
        writer = SummaryWriter(config.OUTPUT)

    dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn = build_loader(config)

    logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}")
    model = build_model(config)

    model.cuda()
    logger.info(str(model)[:10000])

    model_ema = None
    if config.MODEL_EMA:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        logger.info(f"Using EMA...")
        model_ema = ModelEma(
            model,
            decay=config.MODEL_EMA_DECAY,
        )

    optimizer = build_optimizer(config, model)
    if config.TRAIN.AMP:
        logger.info(f"-------------------------------Using Pytorch AMP...--------------------------------")

    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False, find_unused_parameters=False)
    model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"number of params: {n_parameters}")

    lr_scheduler = build_scheduler(config)

    if config.AUG.MIXUP > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif config.MODEL.LABEL_SMOOTHING > 0.:
        criterion = LabelSmoothingCrossEntropy(smoothing=config.MODEL.LABEL_SMOOTHING)
    else:
        criterion = torch.nn.CrossEntropyLoss()
    criterion_bce = torch.nn.BCEWithLogitsLoss()
    max_accuracy = 0.0

    if config.TRAIN.AUTO_RESUME:
        resume_file = auto_resume_helper(config.OUTPUT, logger)
        if resume_file:
            if config.MODEL.RESUME:
                logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}")
            config.defrost()
            config.MODEL.RESUME = resume_file
            config.freeze()
            logger.info(f'auto resuming from {resume_file}')
        else:
            logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume')

    if config.MODEL.RESUME:
        max_accuracy = load_checkpoint(config, model_without_ddp, optimizer, logger, model_ema)
        logger.info(f"Start validation")
        acc1, acc5, loss = validate(config, data_loader_val, model, writer, epoch=config.TRAIN.START_EPOCH)
        logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%, {acc5:.1f}%")

        if config.EVAL_MODE:
            return

    if config.MODEL.FINETUNE:
        load_checkpoint_finetune(config, model_without_ddp, logger)

    logger.info("Start training")
    start_time = time.time()
    for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS):
        data_loader_train.sampler.set_epoch(epoch)

        train_one_epoch(config, model, criterion, criterion_bce, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler, writer, model_ema)

        acc1, acc5, _ = validate(config, data_loader_val, model, writer, epoch)
        logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.2f}%, {acc5:.2f}%")

        if config.MODEL_EMA:
            acc1_ema, acc5_ema, _ = validate_ema(config, data_loader_val, model_ema.ema, writer, epoch)
            logger.info(f"Accuracy of the EMA network on the {len(dataset_val)} test images: {acc1_ema:.1f}%, {acc5_ema:.1f}%")
            # acc1 = max(acc1, acc1_ema)

        if dist.get_rank() == 0 and epoch % config.SAVE_FREQ == 0:
            save_checkpoint(config, epoch, model_without_ddp, acc1, max_accuracy, optimizer, logger, model_ema)

        max_accuracy = max(max_accuracy, acc1)
        logger.info(f'Max accuracy: {max_accuracy:.2f}%')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    logger.info('Training time {}'.format(total_time_str))


def train_one_epoch(config, model, criterion_ce, criterion_bce, data_loader, optimizer, epoch, mixup_fn, lr_scheduler, writer, model_ema: Optional[ModelEma] = None):
    global logger
    model.train()
    optimizer.zero_grad()

    num_steps = len(data_loader)
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    norm_meter = AverageMeter()
    data_time = AverageMeter()

    start = time.time()
    end = time.time()

    for idx, (samples, targets) in enumerate(data_loader):

        samples = samples.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)

        data_time.update(time.time() - end)
        lr_scheduler.step_update(optimizer, idx / num_steps + epoch, config)
        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)
        with amp.autocast(enabled=config.TRAIN.AMP):
            output_label, output_feature = model(samples)
            if len(output_label) == 1:
                loss = criterion_ce(output_label[0], targets)
                multi_loss = []
            else:
                loss, multi_loss = compound_loss((config.REVCOL.FCOE, config.REVCOL.CCOE), output_feature, denormalize(samples, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), output_label, targets, criterion_bce, criterion_ce, epoch)

        if not math.isfinite(loss.item()):
            print("Loss is {} in iteration {}, multiloss {}!".format(loss.item(), idx, multi_loss))

        scaler.scale(loss).backward()
        if config.TRAIN.CLIP_GRAD:
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD)
        else:
            scaler.unscale_(optimizer)
            grad_norm = get_grad_norm(model.parameters())
        scaler.step(optimizer)
        scaler.update()

        optimizer.zero_grad()

        if model_ema is not None:
            model_ema.update(model)

        loss_meter.update(loss.item(), targets.size(0))
        norm_meter.update(grad_norm)
        batch_time.update(time.time() - end)
        end = time.time()

        if dist.get_rank() == 0 and idx % 10 == 0:
            writer.add_scalar('Train/train_loss', loss_meter.val, epoch * num_steps + idx)
            writer.add_scalar('Train/grad_norm', norm_meter.val, epoch * num_steps + idx)
            for i, subloss in enumerate(multi_loss):
                writer.add_scalar(f'Train/sub_loss{i}', subloss, epoch * num_steps + idx)

        if idx % config.PRINT_FREQ == 0:
            lr = optimizer.param_groups[-1]['lr']
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            etas = batch_time.avg * (num_steps - idx)
            logger.info(
                f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t'
                f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t'
                f'datatime {data_time.val:.4f} ({data_time.avg:.4f})\t'
                f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
                f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t'
                f'mem {memory_used:.0f}MB')

    epoch_time = time.time() - start
    logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}")


@torch.no_grad()
def validate(config, data_loader, model, writer, epoch):
    global logger
    criterion = torch.nn.CrossEntropyLoss()
    model.eval()

    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    acc1_meter_list = []
    acc5_meter_list = []
    for i in range(4):
        acc1_meter_list.append(AverageMeter())
        acc5_meter_list.append(AverageMeter())

    end = time.time()
    for idx, (images, target) in enumerate(data_loader):

        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        outputs, _ = model(images)

        if len(acc1_meter_list) != len(outputs):
            acc1_meter_list = acc1_meter_list[:len(outputs)]
            acc5_meter_list = acc5_meter_list[:len(outputs)]

        output_last = outputs[-1]
        loss = criterion(output_last, target)
        loss = reduce_tensor(loss)
        loss_meter.update(loss.item(), target.size(0))

        # track the accuracy of every intermediate classifier head
        for i, subnet_out in enumerate(outputs):
            acc1, acc5 = accuracy(subnet_out, target, topk=(1, 5))

            acc1 = reduce_tensor(acc1)
            acc5 = reduce_tensor(acc5)

            acc1_meter_list[i].update(acc1.item(), target.size(0))
            acc5_meter_list[i].update(acc5.item(), target.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if idx % config.PRINT_FREQ == 0:
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            logger.info(
                f'Test: [{idx}/{len(data_loader)}]\t'
                f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'Acc@1 {acc1_meter_list[-1].val:.3f} ({acc1_meter_list[-1].avg:.3f})\t'
                f'Acc@5 {acc5_meter_list[-1].val:.3f} ({acc5_meter_list[-1].avg:.3f})\t'
                f'Mem {memory_used:.0f}MB')

    logger.info(f' * Acc@1 {acc1_meter_list[-1].avg:.3f} Acc@5 {acc5_meter_list[-1].avg:.3f}')
    if dist.get_rank() == 0:
        for i in range(len(acc1_meter_list)):
            writer.add_scalar(f'Val_top1/acc_{i}', acc1_meter_list[i].avg, epoch)
            writer.add_scalar(f'Val_top5/acc_{i}', acc5_meter_list[i].avg, epoch)
    return acc1_meter_list[-1].avg, acc5_meter_list[-1].avg, loss_meter.avg


@torch.no_grad()
def validate_ema(config, data_loader, model, writer, epoch):
    global logger
    criterion = torch.nn.CrossEntropyLoss()
    model.eval()

    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    acc1_meter = AverageMeter()
    acc5_meter = AverageMeter()

    end = time.time()
    for idx, (images, target) in enumerate(data_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        outputs, _ = model(images)

        output_last = outputs[-1]
        loss = criterion(output_last, target)
        loss = reduce_tensor(loss)
        loss_meter.update(loss.item(), target.size(0))

        acc1, acc5 = accuracy(output_last, target, topk=(1, 5))

        acc1 = reduce_tensor(acc1)
        acc5 = reduce_tensor(acc5)

        acc1_meter.update(acc1.item(), target.size(0))
        acc5_meter.update(acc5.item(), target.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if idx % config.PRINT_FREQ == 0:
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            logger.info(
                f'Test: [{idx}/{len(data_loader)}]\t'
                f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
                f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
                f'Mem {memory_used:.0f}MB')

    logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}')

    return acc1_meter.avg, acc5_meter.avg, loss_meter.avg


if __name__ == '__main__':
    _, config = parse_option()

    cudnn.benchmark = True

    os.makedirs(config.OUTPUT, exist_ok=True)

    main(config)
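main() rescales all learning rates linearly with the total batch size against a base of 1024; a worked example of the rule:

# Sketch: the linear LR scaling applied at the top of main().
base_lr = 1.25e-4          # TRAIN.BASE_LR from the config
batch_size, world_size = 128, 8
scaled = base_lr * batch_size * world_size / 1024.0
print(scaled)              # 1.25e-04: 128 x 8 = 1024 leaves the LR unchanged;
                           # 16 GPUs would double it to 2.5e-04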
training/models/__init__.py
ADDED
@@ -0,0 +1 @@
from .build import build_model
training/models/build.py
ADDED
@@ -0,0 +1,48 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import torch
from models.revcol import *


def build_model(config):
    model_type = config.MODEL.TYPE

    ##-------------------------------------- revcol tiny --------------------------------------#
    if model_type == "revcol_tiny":
        model = revcol_tiny(save_memory=config.REVCOL.SAVEMM, inter_supv=config.REVCOL.INTER_SUPV, drop_path=config.REVCOL.DROP_PATH, num_classes=config.MODEL.NUM_CLASSES, kernel_size=config.REVCOL.KERNEL_SIZE)

    ##-------------------------------------- revcol small -------------------------------------#
    elif model_type == "revcol_small":
        model = revcol_small(save_memory=config.REVCOL.SAVEMM, inter_supv=config.REVCOL.INTER_SUPV, drop_path=config.REVCOL.DROP_PATH, num_classes=config.MODEL.NUM_CLASSES, kernel_size=config.REVCOL.KERNEL_SIZE)

    ##-------------------------------------- revcol base --------------------------------------#
    elif model_type == "revcol_base":
        model = revcol_base(save_memory=config.REVCOL.SAVEMM, inter_supv=config.REVCOL.INTER_SUPV, drop_path=config.REVCOL.DROP_PATH, num_classes=config.MODEL.NUM_CLASSES, kernel_size=config.REVCOL.KERNEL_SIZE)

    ##-------------------------------------- revcol large -------------------------------------#
    elif model_type == "revcol_large":
        model = revcol_large(save_memory=config.REVCOL.SAVEMM, inter_supv=config.REVCOL.INTER_SUPV, drop_path=config.REVCOL.DROP_PATH, num_classes=config.MODEL.NUM_CLASSES, head_init_scale=config.REVCOL.HEAD_INIT_SCALE, kernel_size=config.REVCOL.KERNEL_SIZE)

    ##-------------------------------------- revcol xlarge ------------------------------------#
    elif model_type == "revcol_xlarge":
        model = revcol_xlarge(save_memory=config.REVCOL.SAVEMM, inter_supv=config.REVCOL.INTER_SUPV, drop_path=config.REVCOL.DROP_PATH, num_classes=config.MODEL.NUM_CLASSES, head_init_scale=config.REVCOL.HEAD_INIT_SCALE, kernel_size=config.REVCOL.KERNEL_SIZE)

    else:
        raise NotImplementedError(f"Unknown model: {model_type}")

    return model
training/models/modules.py
ADDED
@@ -0,0 +1,157 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import DropPath


class UpSampleConvnext(nn.Module):
    def __init__(self, ratio, inchannel, outchannel):
        super().__init__()
        self.ratio = ratio
        self.channel_reschedule = nn.Sequential(
            nn.Linear(inchannel, outchannel),
            LayerNorm(outchannel, eps=1e-6, data_format="channels_last"))
        self.upsample = nn.Upsample(scale_factor=2 ** ratio, mode='nearest')

    def forward(self, x):
        x = x.permute(0, 2, 3, 1)
        x = self.channel_reschedule(x)
        x = x.permute(0, 3, 1, 2)
        return self.upsample(x)


class LayerNorm(nn.Module):
    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
    with shape (batch_size, channels, height, width).
    """
    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_first", elementwise_affine=True):
        super().__init__()
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(normalized_shape))
            self.bias = nn.Parameter(torch.zeros(normalized_shape))
        else:
            # F.layer_norm accepts weight=None / bias=None
            self.weight = None
            self.bias = None
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            if self.elementwise_affine:
                x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x


class ConvNextBlock(nn.Module):
    r""" ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    We use (2) as we find it slightly faster in PyTorch

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """
    def __init__(self, in_channel, hidden_dim, out_channel, kernel_size=3, layer_scale_init_value=1e-6, drop_path=0.0):
        super().__init__()
        self.dwconv = nn.Conv2d(in_channel, in_channel, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, groups=in_channel)  # depthwise conv
        self.norm = nn.LayerNorm(in_channel, eps=1e-6)
        self.pwconv1 = nn.Linear(in_channel, hidden_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(hidden_dim, out_channel)
        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((out_channel)),
                                  requires_grad=True) if layer_scale_init_value > 0 else None
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        input = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        x = input + self.drop_path(x)
        return x


class Decoder(nn.Module):
    def __init__(self, depth=[2, 2, 2, 2], dim=[112, 72, 40, 24], block_type=None, kernel_size=3) -> None:
        super().__init__()
        self.depth = depth
        self.dim = dim
        self.block_type = block_type
        self._build_decode_layer(dim, depth, kernel_size)
        self.projback = nn.Sequential(
            nn.Conv2d(
                in_channels=dim[-1],
                out_channels=4 ** 2 * 3, kernel_size=1),
            nn.PixelShuffle(4),
        )

    def _build_decode_layer(self, dim, depth, kernel_size):
        normal_layers = nn.ModuleList()
        upsample_layers = nn.ModuleList()
        proj_layers = nn.ModuleList()

        norm_layer = LayerNorm

        for i in range(1, len(dim)):
            module = [self.block_type(dim[i], dim[i], dim[i], kernel_size) for _ in range(depth[i])]
            normal_layers.append(nn.Sequential(*module))
            upsample_layers.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True))
            proj_layers.append(nn.Sequential(
                nn.Conv2d(dim[i - 1], dim[i], 1, 1),
                norm_layer(dim[i]),
                nn.GELU()
            ))
        self.normal_layers = normal_layers
        self.upsample_layers = upsample_layers
        self.proj_layers = proj_layers

    def _forward_stage(self, stage, x):
        x = self.proj_layers[stage](x)
        x = self.upsample_layers[stage](x)
        return self.normal_layers[stage](x)

    def forward(self, c3):
        x = self._forward_stage(0, c3)  # 14x14
        x = self._forward_stage(1, x)   # 28x28
        x = self._forward_stage(2, x)   # 56x56
        x = self.projback(x)            # back to input resolution
        return x


class SimDecoder(nn.Module):
    def __init__(self, in_channel, encoder_stride) -> None:
        super().__init__()
        self.projback = nn.Sequential(
            LayerNorm(in_channel),
            nn.Conv2d(
                in_channels=in_channel,
                out_channels=encoder_stride ** 2 * 3, kernel_size=1),
            nn.PixelShuffle(encoder_stride),
        )

    def forward(self, c3):
        return self.projback(c3)
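The channels_first branch of LayerNorm above is numerically the same as nn.LayerNorm applied to the permuted tensor; a small equivalence check (a sketch, assuming models.modules is importable):

# Sketch: channels_first LayerNorm vs. nn.LayerNorm on a permuted NCHW tensor.
import torch
import torch.nn as nn
from models.modules import LayerNorm

x = torch.randn(2, 8, 4, 4)                          # (N, C, H, W)
ln_cf = LayerNorm(8, data_format="channels_first")
ln_ref = nn.LayerNorm(8, eps=1e-6)

y1 = ln_cf(x)
y2 = ln_ref(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
print(torch.allclose(y1, y2, atol=1e-6))             # True (both start from unit weight, zero bias)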
training/models/revcol.py
ADDED
@@ -0,0 +1,242 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import torch
import torch.nn as nn
from models.modules import ConvNextBlock, Decoder, LayerNorm, SimDecoder, UpSampleConvnext
import torch.distributed as dist
from models.revcol_function import ReverseFunction
from timm.models.layers import trunc_normal_


class Fusion(nn.Module):
    def __init__(self, level, channels, first_col) -> None:
        super().__init__()

        self.level = level
        self.first_col = first_col
        self.down = nn.Sequential(
            nn.Conv2d(channels[level - 1], channels[level], kernel_size=2, stride=2),
            LayerNorm(channels[level], eps=1e-6, data_format="channels_first"),
        ) if level in [1, 2, 3] else nn.Identity()
        if not first_col:
            self.up = UpSampleConvnext(1, channels[level + 1], channels[level]) if level in [0, 1, 2] else nn.Identity()

    def forward(self, *args):
        c_down, c_up = args

        if self.first_col:
            x = self.down(c_down)
            return x

        if self.level == 3:
            x = self.down(c_down)
        else:
            x = self.up(c_up) + self.down(c_down)
        return x


class Level(nn.Module):
    def __init__(self, level, channels, layers, kernel_size, first_col, dp_rate=0.0) -> None:
        super().__init__()
        countlayer = sum(layers[:level])
        expansion = 4
        self.fusion = Fusion(level, channels, first_col)
        modules = [ConvNextBlock(channels[level], expansion * channels[level], channels[level], kernel_size=kernel_size, layer_scale_init_value=1e-6, drop_path=dp_rate[countlayer + i]) for i in range(layers[level])]
        self.blocks = nn.Sequential(*modules)

    def forward(self, *args):
        x = self.fusion(*args)
        x = self.blocks(x)
        return x


class SubNet(nn.Module):
    def __init__(self, channels, layers, kernel_size, first_col, dp_rates, save_memory) -> None:
        super().__init__()
        shortcut_scale_init_value = 0.5
        self.save_memory = save_memory
        self.alpha0 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[0], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None
        self.alpha1 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[1], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None
        self.alpha2 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[2], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None
        self.alpha3 = nn.Parameter(shortcut_scale_init_value * torch.ones((1, channels[3], 1, 1)),
                                   requires_grad=True) if shortcut_scale_init_value > 0 else None

        self.level0 = Level(0, channels, layers, kernel_size, first_col, dp_rates)
        self.level1 = Level(1, channels, layers, kernel_size, first_col, dp_rates)
        self.level2 = Level(2, channels, layers, kernel_size, first_col, dp_rates)
        self.level3 = Level(3, channels, layers, kernel_size, first_col, dp_rates)

    def _forward_nonreverse(self, *args):
        x, c0, c1, c2, c3 = args

        c0 = (self.alpha0) * c0 + self.level0(x, c1)
        c1 = (self.alpha1) * c1 + self.level1(c0, c2)
        c2 = (self.alpha2) * c2 + self.level2(c1, c3)
        c3 = (self.alpha3) * c3 + self.level3(c2, None)
        return c0, c1, c2, c3

    def _forward_reverse(self, *args):
        local_funs = [self.level0, self.level1, self.level2, self.level3]
        alpha = [self.alpha0, self.alpha1, self.alpha2, self.alpha3]
        _, c0, c1, c2, c3 = ReverseFunction.apply(
            local_funs, alpha, *args)

        return c0, c1, c2, c3

    def forward(self, *args):
        self._clamp_abs(self.alpha0.data, 1e-3)
        self._clamp_abs(self.alpha1.data, 1e-3)
        self._clamp_abs(self.alpha2.data, 1e-3)
        self._clamp_abs(self.alpha3.data, 1e-3)

        if self.save_memory:
            return self._forward_reverse(*args)
        else:
            return self._forward_nonreverse(*args)

    def _clamp_abs(self, data, value):
        with torch.no_grad():
            sign = data.sign()
            data.abs_().clamp_(value)
            data *= sign


class Classifier(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.LayerNorm(in_channels, eps=1e-6),  # final norm layer
            nn.Linear(in_channels, num_classes),
        )

    def forward(self, x):
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


class FullNet(nn.Module):
    def __init__(self, channels=[32, 64, 96, 128], layers=[2, 3, 6, 3], num_subnet=5, kernel_size=3, num_classes=1000, drop_path=0.0, save_memory=True, inter_supv=True, head_init_scale=None) -> None:
        super().__init__()
        self.num_subnet = num_subnet
        self.inter_supv = inter_supv
        self.channels = channels
        self.layers = layers

        self.stem = nn.Sequential(
            nn.Conv2d(3, channels[0], kernel_size=4, stride=4),
            LayerNorm(channels[0], eps=1e-6, data_format="channels_first")
        )

        dp_rate = [x.item() for x in torch.linspace(0, drop_path, sum(layers))]
        for i in range(num_subnet):
            first_col = True if i == 0 else False
            self.add_module(f'subnet{str(i)}', SubNet(
                channels, layers, kernel_size, first_col, dp_rates=dp_rate, save_memory=save_memory))

        if not inter_supv:
            self.cls = Classifier(in_channels=channels[-1], num_classes=num_classes)
        else:
            self.cls_blocks = nn.ModuleList([Classifier(in_channels=channels[-1], num_classes=num_classes) for _ in range(4)])
            if num_classes <= 1000:
                channels.reverse()
                self.decoder_blocks = nn.ModuleList([Decoder(depth=[1, 1, 1, 1], dim=channels, block_type=ConvNextBlock, kernel_size=3) for _ in range(3)])
            else:
                self.decoder_blocks = nn.ModuleList([SimDecoder(in_channel=channels[-1], encoder_stride=32) for _ in range(3)])

        self.apply(self._init_weights)

        if head_init_scale:
            print(f'Head_init_scale: {head_init_scale}')
            self.cls.classifier._modules['1'].weight.data.mul_(head_init_scale)
            self.cls.classifier._modules['1'].bias.data.mul_(head_init_scale)

    def forward(self, x):
        if self.inter_supv:
            return self._forward_intermediate_supervision(x)
        else:
            c0, c1, c2, c3 = 0, 0, 0, 0
            x = self.stem(x)
            for i in range(self.num_subnet):
                c0, c1, c2, c3 = getattr(self, f'subnet{str(i)}')(x, c0, c1, c2, c3)
            return [self.cls(c3)], None

    def _forward_intermediate_supervision(self, x):
        x_cls_out = []
        x_img_out = []
        c0, c1, c2, c3 = 0, 0, 0, 0
        interval = self.num_subnet // 4

        x = self.stem(x)
        for i in range(self.num_subnet):
            c0, c1, c2, c3 = getattr(self, f'subnet{str(i)}')(x, c0, c1, c2, c3)
            if (i + 1) % interval == 0:
                x_cls_out.append(self.cls_blocks[i // interval](c3))
                if i != self.num_subnet - 1:
                    x_img_out.append(self.decoder_blocks[i // interval](c3))

        return x_cls_out, x_img_out

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            trunc_normal_(module.weight, std=.02)
            nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.Linear):
            trunc_normal_(module.weight, std=.02)
            nn.init.constant_(module.bias, 0)

##-------------------------------------- Tiny -----------------------------------------

def revcol_tiny(save_memory, inter_supv=True, drop_path=0.1, num_classes=1000, kernel_size=3):
    channels = [64, 128, 256, 512]
    layers = [2, 2, 4, 2]
    num_subnet = 4
    return FullNet(channels, layers, num_subnet, num_classes=num_classes, drop_path=drop_path, save_memory=save_memory, inter_supv=inter_supv, kernel_size=kernel_size)

##-------------------------------------- Small -----------------------------------------

def revcol_small(save_memory, inter_supv=True, drop_path=0.3, num_classes=1000, kernel_size=3):
    channels = [64, 128, 256, 512]
    layers = [2, 2, 4, 2]
    num_subnet = 8
    return FullNet(channels, layers, num_subnet, num_classes=num_classes, drop_path=drop_path, save_memory=save_memory, inter_supv=inter_supv, kernel_size=kernel_size)

##-------------------------------------- Base -----------------------------------------

def revcol_base(save_memory, inter_supv=True, drop_path=0.4, num_classes=1000, kernel_size=3, head_init_scale=None):
    channels = [72, 144, 288, 576]
    layers = [1, 1, 3, 2]
    num_subnet = 16
    return FullNet(channels, layers, num_subnet, num_classes=num_classes, drop_path=drop_path, save_memory=save_memory, inter_supv=inter_supv, head_init_scale=head_init_scale, kernel_size=kernel_size)

##-------------------------------------- Large -----------------------------------------

def revcol_large(save_memory, inter_supv=True, drop_path=0.5, num_classes=1000, kernel_size=3, head_init_scale=None):
    channels = [128, 256, 512, 1024]
    layers = [1, 2, 6, 2]
    num_subnet = 8
    return FullNet(channels, layers, num_subnet, num_classes=num_classes, drop_path=drop_path, save_memory=save_memory, inter_supv=inter_supv, head_init_scale=head_init_scale, kernel_size=kernel_size)

##-------------------------------------- Extra-Large -----------------------------------------

def revcol_xlarge(save_memory, inter_supv=True, drop_path=0.5, num_classes=1000, kernel_size=3, head_init_scale=None):
    channels = [224, 448, 896, 1792]
    layers = [1, 2, 6, 2]
    num_subnet = 8
    return FullNet(channels, layers, num_subnet, num_classes=num_classes, drop_path=drop_path, save_memory=save_memory, inter_supv=inter_supv, head_init_scale=head_init_scale, kernel_size=kernel_size)
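A quick forward-pass sketch for the tiny variant on CPU (save_memory=False avoids the custom reversible autograd path):

# Sketch: revcol_tiny produces 4 classifier outputs and 3 image reconstructions.
import torch
from models.revcol import revcol_tiny

model = revcol_tiny(save_memory=False, inter_supv=True, num_classes=1000)
x = torch.randn(1, 3, 224, 224)
cls_out, img_out = model(x)
print(len(cls_out), cls_out[-1].shape)   # 4 torch.Size([1, 1000])
print(len(img_out), img_out[0].shape)    # 3 torch.Size([1, 3, 224, 224])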
training/models/revcol_function.py
ADDED
@@ -0,0 +1,159 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import torch
from typing import Any, Iterable, List, Tuple, Callable
import torch.distributed as dist

def get_gpu_states(fwd_gpu_devices) -> List[torch.Tensor]:
    # Snapshot the CUDA RNG state of every device seen in forward, so the
    # recomputation in backward can replay the same randomness (e.g. drop path).
    fwd_gpu_states = []
    for device in fwd_gpu_devices:
        with torch.cuda.device(device):
            fwd_gpu_states.append(torch.cuda.get_rng_state())
    return fwd_gpu_states

def get_gpu_device(*args):
    # This will not error out if an arg is a CPU tensor or a non-tensor type
    # because the conditionals short-circuit.
    fwd_gpu_devices = list(set(arg.get_device() for arg in args
                               if isinstance(arg, torch.Tensor) and arg.is_cuda))
    return fwd_gpu_devices

def set_device_states(fwd_cpu_state, devices, states) -> None:
    torch.set_rng_state(fwd_cpu_state)
    for device, state in zip(devices, states):
        with torch.cuda.device(device):
            torch.cuda.set_rng_state(state)

def detach_and_grad(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]:
    if isinstance(inputs, tuple):
        out = []
        for inp in inputs:
            if not isinstance(inp, torch.Tensor):
                out.append(inp)
                continue

            x = inp.detach()
            x.requires_grad = True
            out.append(x)
        return tuple(out)
    else:
        raise RuntimeError(
            "Only tuple of tensors is supported. Got unsupported input type: ", type(inputs).__name__)

def get_cpu_and_gpu_states(gpu_devices):
    return torch.get_rng_state(), get_gpu_states(gpu_devices)

class ReverseFunction(torch.autograd.Function):
    # One reversible column: forward runs the four level functions l0..l3 under
    # no_grad, saving only the column's boundary states plus RNG/autocast state;
    # backward reconstructs each intermediate state from c_i = l_i(...) + alpha_i * c_i
    # and replays the level computations to obtain gradients.
    @staticmethod
    def forward(ctx, run_functions, alpha, *args):
        l0, l1, l2, l3 = run_functions
        alpha0, alpha1, alpha2, alpha3 = alpha
        ctx.run_functions = run_functions
        ctx.alpha = alpha
        ctx.preserve_rng_state = True

        ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
                                   "dtype": torch.get_autocast_gpu_dtype(),
                                   "cache_enabled": torch.is_autocast_cache_enabled()}
        ctx.cpu_autocast_kwargs = {"enabled": torch.is_autocast_cpu_enabled(),
                                   "dtype": torch.get_autocast_cpu_dtype(),
                                   "cache_enabled": torch.is_autocast_cache_enabled()}

        assert len(args) == 5
        [x, c0, c1, c2, c3] = args
        if isinstance(c0, int):  # the first column receives zeros, not tensors
            ctx.first_col = True
        else:
            ctx.first_col = False
        with torch.no_grad():
            gpu_devices = get_gpu_device(*args)
            ctx.gpu_devices = gpu_devices
            ctx.cpu_states_0, ctx.gpu_states_0 = get_cpu_and_gpu_states(gpu_devices)
            c0 = l0(x, c1) + c0*alpha0
            ctx.cpu_states_1, ctx.gpu_states_1 = get_cpu_and_gpu_states(gpu_devices)
            c1 = l1(c0, c2) + c1*alpha1
            ctx.cpu_states_2, ctx.gpu_states_2 = get_cpu_and_gpu_states(gpu_devices)
            c2 = l2(c1, c3) + c2*alpha2
            ctx.cpu_states_3, ctx.gpu_states_3 = get_cpu_and_gpu_states(gpu_devices)
            c3 = l3(c2, None) + c3*alpha3
            ctx.save_for_backward(x, c0, c1, c2, c3)
        return x, c0, c1, c2, c3

    @staticmethod
    def backward(ctx, *grad_outputs):
        x, c0, c1, c2, c3 = ctx.saved_tensors
        l0, l1, l2, l3 = ctx.run_functions
        alpha0, alpha1, alpha2, alpha3 = ctx.alpha
        gx_right, g0_right, g1_right, g2_right, g3_right = grad_outputs
        (x, c0, c1, c2, c3) = detach_and_grad((x, c0, c1, c2, c3))

        with torch.enable_grad(), \
                torch.random.fork_rng(devices=ctx.gpu_devices, enabled=ctx.preserve_rng_state), \
                torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs), \
                torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):

            g3_up = g3_right
            g3_left = g3_up*alpha3  ## shortcut
            set_device_states(ctx.cpu_states_3, ctx.gpu_devices, ctx.gpu_states_3)
            oup3 = l3(c2, None)
            torch.autograd.backward(oup3, g3_up, retain_graph=True)
            with torch.no_grad():
                c3_left = (1/alpha3)*(c3 - oup3)  ## feature reverse
                g2_up = g2_right + c2.grad
                g2_left = g2_up*alpha2  ## shortcut

            (c3_left,) = detach_and_grad((c3_left,))
            set_device_states(ctx.cpu_states_2, ctx.gpu_devices, ctx.gpu_states_2)
            oup2 = l2(c1, c3_left)
            torch.autograd.backward(oup2, g2_up, retain_graph=True)
            c3_left.requires_grad = False
            cout3 = c3_left*alpha3  ## alpha3 update
            torch.autograd.backward(cout3, g3_up)

            with torch.no_grad():
                c2_left = (1/alpha2)*(c2 - oup2)  ## feature reverse
                g3_left = g3_left + c3_left.grad if c3_left.grad is not None else g3_left
                g1_up = g1_right + c1.grad
                g1_left = g1_up*alpha1  ## shortcut

            (c2_left,) = detach_and_grad((c2_left,))
            set_device_states(ctx.cpu_states_1, ctx.gpu_devices, ctx.gpu_states_1)
            oup1 = l1(c0, c2_left)
            torch.autograd.backward(oup1, g1_up, retain_graph=True)
            c2_left.requires_grad = False
            cout2 = c2_left*alpha2  ## alpha2 update
            torch.autograd.backward(cout2, g2_up)

            with torch.no_grad():
                c1_left = (1/alpha1)*(c1 - oup1)  ## feature reverse
                g0_up = g0_right + c0.grad
                g0_left = g0_up*alpha0  ## shortcut
                g2_left = g2_left + c2_left.grad if c2_left.grad is not None else g2_left  ## Fusion

            (c1_left,) = detach_and_grad((c1_left,))
            set_device_states(ctx.cpu_states_0, ctx.gpu_devices, ctx.gpu_states_0)
            oup0 = l0(x, c1_left)
            torch.autograd.backward(oup0, g0_up, retain_graph=True)
            c1_left.requires_grad = False
            cout1 = c1_left*alpha1  ## alpha1 update
            torch.autograd.backward(cout1, g1_up)

            with torch.no_grad():
                c0_left = (1/alpha0)*(c0 - oup0)  ## feature reverse
                gx_up = x.grad  ## Fusion
                g1_left = g1_left + c1_left.grad if c1_left.grad is not None else g1_left  ## Fusion
            c0_left.requires_grad = False
            cout0 = c0_left*alpha0  ## alpha0 update
            torch.autograd.backward(cout0, g0_up)

        if ctx.first_col:
            return None, None, gx_up, None, None, None, None
        else:
            return None, None, gx_up, g0_left, g1_left, g2_left, g3_left
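The backward pass above works because each column update has the reversible form c_new = l(...) + alpha * c_old, so the pre-update state can be recovered as (c_new - l(...)) / alpha instead of being stored. A toy sketch of that inversion rule (illustrative names, not repo code):

    import torch

    f = torch.nn.Linear(8, 8)     # stands in for one level function l_i
    h = torch.randn(2, 8)         # the level's other input
    c_old, alpha = torch.randn(2, 8), 0.5

    c_new = f(h) + alpha * c_old              # forward, as in ReverseFunction.forward
    c_rec = (1 / alpha) * (c_new - f(h))      # "feature reverse", as in backward
    print(torch.allclose(c_rec, c_old, atol=1e-5))  # True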
training/optimizer.py
ADDED
@@ -0,0 +1,145 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import numpy as np
from torch import optim as optim

def build_optimizer(config, model):
    """
    Build optimizer, set weight decay of normalization to 0 by default.
    """
    skip = {}
    skip_keywords = {}
    if hasattr(model, 'no_weight_decay'):
        skip = model.no_weight_decay()
    if hasattr(model, 'no_weight_decay_keywords'):
        skip_keywords = model.no_weight_decay_keywords()

    # RevCol models get layer-wise lr decay groups; everything else falls back
    # to the plain decay/no-decay split. (An `elif` here would leave
    # `parameters` unset whenever the model defines no_weight_decay_keywords.)
    if config.MODEL.TYPE.startswith("revcol"):
        parameters = param_groups_lrd(model, weight_decay=config.TRAIN.WEIGHT_DECAY,
                                      no_weight_decay_list=[], layer_decay=config.TRAIN.OPTIMIZER.LAYER_DECAY)
    else:
        parameters = set_weight_decay(model, skip, skip_keywords)

    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
    optimizer = None
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                                lr=config.TRAIN.BASE_LR)

    return optimizer


def set_weight_decay(model, skip_list=(), skip_keywords=()):
    has_decay = []
    no_decay = []

    for name, param in model.named_parameters():
        if not param.requires_grad or name in ["linear_eval.weight", "linear_eval.bias"]:
            continue  # frozen weights
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            no_decay.append(param)
            # print(f"{name} has no weight decay")
        else:
            has_decay.append(param)
    return [{'params': has_decay},
            {'params': no_decay, 'weight_decay': 0.}]


def check_keywords_in_name(name, keywords=()):
    isin = False
    for keyword in keywords:
        if keyword in name:
            isin = True
    return isin

def cal_model_depth(columns, layers):
    # dp[i][j] holds the "depth" of block i in column j, i.e. the length of the
    # shortest path from the network input, used to order blocks for lr decay.
    depth = sum(layers)
    dp = np.zeros((depth, columns))
    dp[:, 0] = np.linspace(0, depth-1, depth)
    dp[0, :] = np.linspace(0, columns-1, columns)
    for i in range(1, depth):
        for j in range(1, columns):
            dp[i][j] = min(dp[i][j-1], dp[i-1][j]) + 1
    dp = dp.astype(int)
    return dp


def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75):
    """
    Parameter groups for layer-wise lr decay
    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
    """
    param_group_names = {}
    param_groups = {}
    dp = cal_model_depth(model.num_subnet, model.layers) + 1
    num_layers = dp[-1][-1] + 1

    layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1))

    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue

        # no decay: all 1D parameters and model specific ones
        if p.ndim == 1 or n in no_weight_decay_list:  # or re.match('(.*).alpha.$', n):
            g_decay = "no_decay"
            this_decay = 0.
        else:
            g_decay = "decay"
            this_decay = weight_decay

        layer_id = get_layer_id(n, dp, model.layers)
        group_name = "layer_%d_%s" % (layer_id, g_decay)

        if group_name not in param_group_names:
            this_scale = layer_scales[layer_id]

            param_group_names[group_name] = {
                "lr_scale": this_scale,
                "weight_decay": this_decay,
                "params": [],
            }
            param_groups[group_name] = {
                "lr_scale": this_scale,
                "weight_decay": this_decay,
                "params": [],
            }

        param_group_names[group_name]["params"].append(n)
        param_groups[group_name]["params"].append(p)

    # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2))

    return list(param_groups.values())

def get_layer_id(n, dp, layers):
    if n.startswith("subnet"):
        name_part = n.split('.')
        subnet = int(name_part[0][6:])
        if name_part[1].startswith("alpha"):
            id = dp[0][subnet]
        else:
            level = int(name_part[1][-1])
            if name_part[2].startswith("blocks"):
                sub = int(name_part[3])
                if sub > layers[level]-1:
                    sub = layers[level]-1
                block = sum(layers[:level]) + sub

            if name_part[2].startswith("fusion"):
                block = sum(layers[:level])
            id = dp[block][subnet]
    elif n.startswith("stem"):
        id = 0
    else:
        id = dp[-1][-1] + 1
    return id
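param_groups_lrd walks cal_model_depth's dynamic-programming grid to assign each parameter a layer id, then gives layer id i the multiplier layer_decay ** (num_layers - i); the lr_scale stored in each group is meant to be consumed when per-group learning rates are set (PyTorch optimizers preserve extra keys in param group dicts). A small sketch of the schedule with toy numbers, not the repo's defaults:

    # layer_decay=0.75 over 4 layers: earlier layers train with geometrically
    # smaller learning-rate scales, the head with the full base LR.
    layer_decay, num_layers = 0.75, 4
    layer_scales = [layer_decay ** (num_layers - i) for i in range(num_layers + 1)]
    print(layer_scales)  # [0.31640625, 0.421875, 0.5625, 0.75, 1.0]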
training/requirements.txt
ADDED
@@ -0,0 +1,7 @@
fvcore==0.1.5.post20211023
numpy==1.20.3
opencv_python==4.4.0.46
termcolor==1.1.0
timm==0.5.4
yacs==0.1.8
tensorboard
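These pins reflect the 2021/2022-era stack the code targets; in particular timm==0.5.4 provides the ModelEma and get_state_dict imports used in utils.py below. A quick sanity check, assuming the packages are installed in the current environment:

    # Minimal sketch: verify two of the pinned versions at runtime.
    import numpy, timm
    assert numpy.__version__ == "1.20.3", numpy.__version__
    assert timm.__version__ == "0.5.4", timm.__version__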
training/utils.py
ADDED
@@ -0,0 +1,179 @@
# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import io
import os
import re
from typing import List
from timm.utils.model_ema import ModelEma
import torch
import torch.distributed as dist
from timm.utils import get_state_dict
import subprocess


def load_checkpoint(config, model, optimizer, logger, model_ema=None):
    logger.info(f"==============> Resuming from {config.MODEL.RESUME}....................")
    if config.MODEL.RESUME.startswith('https'):
        checkpoint = torch.hub.load_state_dict_from_url(
            config.MODEL.RESUME, map_location='cpu', check_hash=True)
    else:
        checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
    logger.info("Loaded checkpoint into memory..")
    msg = model.load_state_dict(checkpoint['model'], strict=False)
    logger.info(msg)
    max_accuracy = 0.0
    if config.MODEL_EMA:
        if 'state_dict_ema' in checkpoint.keys():
            model_ema.ema.load_state_dict(checkpoint['state_dict_ema'], strict=False)
            logger.info("Loaded state_dict_ema")
        else:
            model_ema.ema.load_state_dict(checkpoint['model'], strict=False)
            logger.warning("Failed to find state_dict_ema, starting from loaded model weights")

    if not config.EVAL_MODE and 'optimizer' in checkpoint and 'epoch' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
        config.defrost()
        config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
        config.freeze()

        logger.info(f"=> loaded successfully '{config.MODEL.RESUME}' (epoch {checkpoint['epoch']})")
        if 'max_accuracy' in checkpoint:
            max_accuracy = checkpoint['max_accuracy']
    # del checkpoint
    # torch.cuda.empty_cache()
    return max_accuracy

def load_checkpoint_finetune(config, model, logger, model_ema=None):
    logger.info(f"==============> Finetune {config.MODEL.FINETUNE}....................")
    checkpoint = torch.load(config.MODEL.FINETUNE, map_location='cpu')['model']
    converted_weights = {}
    keys = list(checkpoint.keys())
    for key in keys:
        if re.match(r'cls.*', key):
            # if re.match(r'cls.classifier.1.*', key):
            print(f'key: {key} is used for pretrain, discarded.')
            continue
        else:
            converted_weights[key] = checkpoint[key]
    msg = model.load_state_dict(converted_weights, strict=False)
    logger.info(msg)
    if model_ema is not None:
        ema_msg = model_ema.ema.load_state_dict(converted_weights, strict=False)
        logger.info("==============> Loaded pretrained state dict into EMA....................")
        logger.info(ema_msg)
    del checkpoint
    torch.cuda.empty_cache()


def save_checkpoint(config, epoch, model, epoch_accuracy, max_accuracy, optimizer, logger, model_ema=None):
    if model_ema is not None:
        logger.info("Model EMA is not None...")
        save_state = {'model': model.state_dict(),
                      'optimizer': optimizer.state_dict(),
                      'max_accuracy': max(max_accuracy, epoch_accuracy),
                      'epoch': epoch,
                      'state_dict_ema': get_state_dict(model_ema),
                      'config': config}
    else:
        save_state = {'model': model.state_dict(),
                      'optimizer': optimizer.state_dict(),
                      'max_accuracy': max(max_accuracy, epoch_accuracy),
                      'epoch': epoch,
                      'state_dict_ema': None,
                      'config': config}

    save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{epoch}.pth')
    best_path = os.path.join(config.OUTPUT, 'best.pth')

    logger.info(f"{save_path} saving......")
    torch.save(save_state, save_path)
    if epoch_accuracy > max_accuracy:
        torch.save(save_state, best_path)
    logger.info(f"{save_path} saved !!!")


def get_grad_norm(parameters, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
    total_norm = total_norm ** (1. / norm_type)
    return total_norm


def auto_resume_helper(output_dir, logger):
    checkpoints = os.listdir(output_dir)
    checkpoints = [ckpt for ckpt in checkpoints if ckpt.endswith('pth') and ckpt.startswith('ckpt_')]
    logger.info(f"All checkpoints found in {output_dir}: {checkpoints}")
    if len(checkpoints) > 0:
        latest_checkpoint = max([os.path.join(output_dir, d) for d in checkpoints], key=os.path.getmtime)
        logger.info(f"The latest checkpoint found: {latest_checkpoint}")
        resume_file = latest_checkpoint
    else:
        resume_file = None
    return resume_file


def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt

def denormalize(tensor: torch.Tensor, mean: List[float], std: List[float], inplace: bool = False) -> torch.Tensor:
    """Denormalize a float tensor image with mean and standard deviation.
    This transform does not support PIL Image.

    .. note::
        This transform acts out of place by default, i.e., it does not mutate the input tensor.

    See :class:`~torchvision.transforms.Normalize` for more details.

    Args:
        tensor (Tensor): Float tensor image of size (C, H, W) or (B, C, H, W) to be denormalized.
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
        inplace(bool, optional): Bool to make this operation inplace.

    Returns:
        Tensor: Denormalized tensor image, clipped to [0, 1].
    """
    if not isinstance(tensor, torch.Tensor):
        raise TypeError('Input tensor should be a torch tensor. Got {}.'.format(type(tensor)))

    if not tensor.is_floating_point():
        raise TypeError('Input tensor should be a float tensor. Got {}.'.format(tensor.dtype))

    if tensor.ndim < 3:
        raise ValueError('Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() = '
                         '{}.'.format(tensor.size()))

    if not inplace:
        tensor = tensor.clone()

    dtype = tensor.dtype
    mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device)
    std = torch.as_tensor(std, dtype=dtype, device=tensor.device)
    if (std == 0).any():
        raise ValueError('std evaluated to zero after conversion to {}, leading to division by zero.'.format(dtype))
    if mean.ndim == 1:
        mean = mean.view(-1, 1, 1)
    if std.ndim == 1:
        std = std.view(-1, 1, 1)
    tensor.mul_(std).add_(mean).clip_(0.0, 1.0)
    return tensor
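denormalize inverts a torchvision-style Normalize and clips the result to [0, 1], e.g. for inspecting reconstructed images. A round-trip sketch, assuming training/utils.py is importable as utils (ImageNet mean/std used for illustration):

    import torch
    from utils import denormalize

    mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    img = torch.rand(3, 4, 4)                    # image already in [0, 1]
    mean_t = torch.tensor(mean).view(-1, 1, 1)
    std_t = torch.tensor(std).view(-1, 1, 1)
    norm = (img - mean_t) / std_t                # what Normalize does
    back = denormalize(norm, mean, std)          # undo it, clipped to [0, 1]
    print(torch.allclose(back, img, atol=1e-6))  # True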