SakuraD committed on
Commit
210b510
1 Parent(s): b5137ad
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +3 -3
  2. app.py +62 -0
  3. configs/_base_/datasets/cityscapes_detection.py +55 -0
  4. configs/_base_/datasets/cityscapes_instance.py +55 -0
  5. configs/_base_/datasets/coco_detection.py +48 -0
  6. configs/_base_/datasets/coco_instance.py +48 -0
  7. configs/_base_/datasets/coco_instance_semantic.py +53 -0
  8. configs/_base_/datasets/deepfashion.py +53 -0
  9. configs/_base_/datasets/lvis_v0.5_instance.py +23 -0
  10. configs/_base_/datasets/lvis_v1_instance.py +23 -0
  11. configs/_base_/datasets/voc0712.py +55 -0
  12. configs/_base_/datasets/wider_face.py +63 -0
  13. configs/_base_/default_runtime.py +16 -0
  14. configs/_base_/models/cascade_mask_rcnn_r50_fpn.py +196 -0
  15. configs/_base_/models/cascade_mask_rcnn_swin_fpn.py +207 -0
  16. configs/_base_/models/cascade_mask_rcnn_uniformer_fpn.py +201 -0
  17. configs/_base_/models/cascade_rcnn_r50_fpn.py +179 -0
  18. configs/_base_/models/fast_rcnn_r50_fpn.py +62 -0
  19. configs/_base_/models/faster_rcnn_r50_caffe_c4.py +112 -0
  20. configs/_base_/models/faster_rcnn_r50_caffe_dc5.py +103 -0
  21. configs/_base_/models/faster_rcnn_r50_fpn.py +107 -0
  22. configs/_base_/models/mask_rcnn_r50_caffe_c4.py +123 -0
  23. configs/_base_/models/mask_rcnn_r50_fpn.py +120 -0
  24. configs/_base_/models/mask_rcnn_swin_fpn.py +127 -0
  25. configs/_base_/models/mask_rcnn_uniformer_fpn.py +121 -0
  26. configs/_base_/models/retinanet_r50_fpn.py +60 -0
  27. configs/_base_/models/rpn_r50_caffe_c4.py +56 -0
  28. configs/_base_/models/rpn_r50_fpn.py +59 -0
  29. configs/_base_/models/ssd300.py +50 -0
  30. configs/_base_/schedules/schedule_1x.py +11 -0
  31. configs/_base_/schedules/schedule_20e.py +11 -0
  32. configs/_base_/schedules/schedule_2x.py +11 -0
  33. configs/albu_example/README.md +19 -0
  34. configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py +73 -0
  35. configs/atss/README.md +21 -0
  36. configs/atss/atss_r101_fpn_1x_coco.py +5 -0
  37. configs/atss/atss_r50_fpn_1x_coco.py +62 -0
  38. configs/carafe/README.md +32 -0
  39. configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py +50 -0
  40. configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py +60 -0
  41. configs/cascade_rcnn/README.md +55 -0
  42. configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py +4 -0
  43. configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py +2 -0
  44. configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py +2 -0
  45. configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py +38 -0
  46. configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py +5 -0
  47. configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py +5 -0
  48. configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py +13 -0
  49. configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py +13 -0
  50. configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py +13 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Uniformer_image_detection
3
- emoji: 💻
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.0.4
8
  app_file: app.py
 
1
  ---
2
  title: Uniformer_image_detection
3
+ emoji: 🌍
4
+ colorFrom: indigo
5
+ colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.0.4
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torchvision.transforms as T
6
+ from mmdet.apis import init_detector, inference_detector, show_result_pyplot
7
+ import mmcv
8
+
9
+ import gradio as gr
10
+ from huggingface_hub import hf_hub_download
11
+
12
+ # Device on which to run the model
13
+ # Set to cuda to load on GPU
14
+ device = "cpu"
15
+ checkpoint_file = hf_hub_download(repo_id="Andy1621/uniformer", filename="mask_rcnn_3x_ms_hybrid_small.pth")
16
+ config_file = './exp/mask_rcnn_3x_ms_hybrid_small/config.py'
17
+ # init detector
18
+ # build the model from a config file and a checkpoint file
19
+ model = init_detector(config_file, checkpoint_file, device=device)
20
+
21
+
22
+ def set_example_image(example: list) -> dict:
23
+ return gr.Image.update(value=example[0])
24
+
25
+
26
+ def inference(img):
27
+ result = inference_detector(model, img)
28
+ res_img = show_result_pyplot(model, img, result)
29
+ return res_img
30
+
31
+
32
+ demo = gr.Blocks()
33
+ with demo:
34
+ gr.Markdown(
35
+ """
36
+ # UniFormer-S
37
+ Gradio demo for <a href='https://github.com/Sense-X/UniFormer' target='_blank'>UniFormer</a>: To use it, simply upload your image, or click one of the examples to load them. Read more at the links below.
38
+ """
39
+ )
40
+
41
+ with gr.Box():
42
+ with gr.Row():
43
+ with gr.Column():
44
+ with gr.Row():
45
+ input_image = gr.Image(label='Input Image', type='numpy')
46
+ with gr.Row():
47
+ submit_button = gr.Button('Submit')
48
+ with gr.Column():
49
+ res_image = gr.Image(type='numpy', label='Detection Results')
50
+ with gr.Row():
51
+ example_images = gr.Dataset(components=[input_image], samples=[['demo.jpg']])
52
+
53
+ gr.Markdown(
54
+ """
55
+ <p style='text-align: center'><a href='https://arxiv.org/abs/2201.09450' target='_blank'>UniFormer: Unifying Convolution and Self-attention for Visual Recognition</a> | <a href='https://github.com/Sense-X/UniFormer' target='_blank'>Github Repo</a></p>
56
+ """
57
+ )
58
+
59
+ submit_button.click(fn=inference, inputs=input_image, outputs=res_image)
60
+ example_images.click(fn=set_example_image, inputs=example_images, outputs=example_images.components)
61
+
62
+ demo.launch(enable_queue=True)
configs/_base_/datasets/cityscapes_detection.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'CityscapesDataset'
2
+ data_root = 'data/cityscapes/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations', with_bbox=True),
8
+ dict(
9
+ type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
10
+ dict(type='RandomFlip', flip_ratio=0.5),
11
+ dict(type='Normalize', **img_norm_cfg),
12
+ dict(type='Pad', size_divisor=32),
13
+ dict(type='DefaultFormatBundle'),
14
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
15
+ ]
16
+ test_pipeline = [
17
+ dict(type='LoadImageFromFile'),
18
+ dict(
19
+ type='MultiScaleFlipAug',
20
+ img_scale=(2048, 1024),
21
+ flip=False,
22
+ transforms=[
23
+ dict(type='Resize', keep_ratio=True),
24
+ dict(type='RandomFlip'),
25
+ dict(type='Normalize', **img_norm_cfg),
26
+ dict(type='Pad', size_divisor=32),
27
+ dict(type='ImageToTensor', keys=['img']),
28
+ dict(type='Collect', keys=['img']),
29
+ ])
30
+ ]
31
+ data = dict(
32
+ samples_per_gpu=1,
33
+ workers_per_gpu=2,
34
+ train=dict(
35
+ type='RepeatDataset',
36
+ times=8,
37
+ dataset=dict(
38
+ type=dataset_type,
39
+ ann_file=data_root +
40
+ 'annotations/instancesonly_filtered_gtFine_train.json',
41
+ img_prefix=data_root + 'leftImg8bit/train/',
42
+ pipeline=train_pipeline)),
43
+ val=dict(
44
+ type=dataset_type,
45
+ ann_file=data_root +
46
+ 'annotations/instancesonly_filtered_gtFine_val.json',
47
+ img_prefix=data_root + 'leftImg8bit/val/',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ ann_file=data_root +
52
+ 'annotations/instancesonly_filtered_gtFine_test.json',
53
+ img_prefix=data_root + 'leftImg8bit/test/',
54
+ pipeline=test_pipeline))
55
+ evaluation = dict(interval=1, metric='bbox')
configs/_base_/datasets/cityscapes_instance.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'CityscapesDataset'
2
+ data_root = 'data/cityscapes/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
8
+ dict(
9
+ type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
10
+ dict(type='RandomFlip', flip_ratio=0.5),
11
+ dict(type='Normalize', **img_norm_cfg),
12
+ dict(type='Pad', size_divisor=32),
13
+ dict(type='DefaultFormatBundle'),
14
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
15
+ ]
16
+ test_pipeline = [
17
+ dict(type='LoadImageFromFile'),
18
+ dict(
19
+ type='MultiScaleFlipAug',
20
+ img_scale=(2048, 1024),
21
+ flip=False,
22
+ transforms=[
23
+ dict(type='Resize', keep_ratio=True),
24
+ dict(type='RandomFlip'),
25
+ dict(type='Normalize', **img_norm_cfg),
26
+ dict(type='Pad', size_divisor=32),
27
+ dict(type='ImageToTensor', keys=['img']),
28
+ dict(type='Collect', keys=['img']),
29
+ ])
30
+ ]
31
+ data = dict(
32
+ samples_per_gpu=1,
33
+ workers_per_gpu=2,
34
+ train=dict(
35
+ type='RepeatDataset',
36
+ times=8,
37
+ dataset=dict(
38
+ type=dataset_type,
39
+ ann_file=data_root +
40
+ 'annotations/instancesonly_filtered_gtFine_train.json',
41
+ img_prefix=data_root + 'leftImg8bit/train/',
42
+ pipeline=train_pipeline)),
43
+ val=dict(
44
+ type=dataset_type,
45
+ ann_file=data_root +
46
+ 'annotations/instancesonly_filtered_gtFine_val.json',
47
+ img_prefix=data_root + 'leftImg8bit/val/',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ ann_file=data_root +
52
+ 'annotations/instancesonly_filtered_gtFine_test.json',
53
+ img_prefix=data_root + 'leftImg8bit/test/',
54
+ pipeline=test_pipeline))
55
+ evaluation = dict(metric=['bbox', 'segm'])
configs/_base_/datasets/coco_detection.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'CocoDataset'
2
+ data_root = 'data/coco/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations', with_bbox=True),
8
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
9
+ dict(type='RandomFlip', flip_ratio=0.5),
10
+ dict(type='Normalize', **img_norm_cfg),
11
+ dict(type='Pad', size_divisor=32),
12
+ dict(type='DefaultFormatBundle'),
13
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
14
+ ]
15
+ test_pipeline = [
16
+ dict(type='LoadImageFromFile'),
17
+ dict(
18
+ type='MultiScaleFlipAug',
19
+ img_scale=(1333, 800),
20
+ flip=False,
21
+ transforms=[
22
+ dict(type='Resize', keep_ratio=True),
23
+ dict(type='RandomFlip'),
24
+ dict(type='Normalize', **img_norm_cfg),
25
+ dict(type='Pad', size_divisor=32),
26
+ dict(type='ImageToTensor', keys=['img']),
27
+ dict(type='Collect', keys=['img']),
28
+ ])
29
+ ]
30
+ data = dict(
31
+ samples_per_gpu=2,
32
+ workers_per_gpu=2,
33
+ train=dict(
34
+ type=dataset_type,
35
+ ann_file=data_root + 'annotations/instances_train2017.json',
36
+ img_prefix=data_root + 'train2017/',
37
+ pipeline=train_pipeline),
38
+ val=dict(
39
+ type=dataset_type,
40
+ ann_file=data_root + 'annotations/instances_val2017.json',
41
+ img_prefix=data_root + 'val2017/',
42
+ pipeline=test_pipeline),
43
+ test=dict(
44
+ type=dataset_type,
45
+ ann_file=data_root + 'annotations/instances_val2017.json',
46
+ img_prefix=data_root + 'val2017/',
47
+ pipeline=test_pipeline))
48
+ evaluation = dict(interval=1, metric='bbox')
configs/_base_/datasets/coco_instance.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'CocoDataset'
2
+ data_root = 'data/coco/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
8
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
9
+ dict(type='RandomFlip', flip_ratio=0.5),
10
+ dict(type='Normalize', **img_norm_cfg),
11
+ dict(type='Pad', size_divisor=32),
12
+ dict(type='DefaultFormatBundle'),
13
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
14
+ ]
15
+ test_pipeline = [
16
+ dict(type='LoadImageFromFile'),
17
+ dict(
18
+ type='MultiScaleFlipAug',
19
+ img_scale=(1333, 800),
20
+ flip=False,
21
+ transforms=[
22
+ dict(type='Resize', keep_ratio=True),
23
+ dict(type='RandomFlip'),
24
+ dict(type='Normalize', **img_norm_cfg),
25
+ dict(type='Pad', size_divisor=32),
26
+ dict(type='ImageToTensor', keys=['img']),
27
+ dict(type='Collect', keys=['img']),
28
+ ])
29
+ ]
30
+ data = dict(
31
+ samples_per_gpu=2,
32
+ workers_per_gpu=2,
33
+ train=dict(
34
+ type=dataset_type,
35
+ ann_file=data_root + 'annotations/instances_train2017.json',
36
+ img_prefix=data_root + 'train2017/',
37
+ pipeline=train_pipeline),
38
+ val=dict(
39
+ type=dataset_type,
40
+ ann_file=data_root + 'annotations/instances_val2017.json',
41
+ img_prefix=data_root + 'val2017/',
42
+ pipeline=test_pipeline),
43
+ test=dict(
44
+ type=dataset_type,
45
+ ann_file=data_root + 'annotations/instances_val2017.json',
46
+ img_prefix=data_root + 'val2017/',
47
+ pipeline=test_pipeline))
48
+ evaluation = dict(metric=['bbox', 'segm'])
configs/_base_/datasets/coco_instance_semantic.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'CocoDataset'
2
+ data_root = 'data/coco/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(
8
+ type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
9
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
10
+ dict(type='RandomFlip', flip_ratio=0.5),
11
+ dict(type='Normalize', **img_norm_cfg),
12
+ dict(type='Pad', size_divisor=32),
13
+ dict(type='SegRescale', scale_factor=1 / 8),
14
+ dict(type='DefaultFormatBundle'),
15
+ dict(
16
+ type='Collect',
17
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(1333, 800),
24
+ flip=False,
25
+ transforms=[
26
+ dict(type='Resize', keep_ratio=True),
27
+ dict(type='RandomFlip', flip_ratio=0.5),
28
+ dict(type='Normalize', **img_norm_cfg),
29
+ dict(type='Pad', size_divisor=32),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=2,
36
+ workers_per_gpu=2,
37
+ train=dict(
38
+ type=dataset_type,
39
+ ann_file=data_root + 'annotations/instances_train2017.json',
40
+ img_prefix=data_root + 'train2017/',
41
+ seg_prefix=data_root + 'stuffthingmaps/train2017/',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ ann_file=data_root + 'annotations/instances_val2017.json',
46
+ img_prefix=data_root + 'val2017/',
47
+ pipeline=test_pipeline),
48
+ test=dict(
49
+ type=dataset_type,
50
+ ann_file=data_root + 'annotations/instances_val2017.json',
51
+ img_prefix=data_root + 'val2017/',
52
+ pipeline=test_pipeline))
53
+ evaluation = dict(metric=['bbox', 'segm'])
configs/_base_/datasets/deepfashion.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'DeepFashionDataset'
3
+ data_root = 'data/DeepFashion/In-shop/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ train_pipeline = [
7
+ dict(type='LoadImageFromFile'),
8
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
9
+ dict(type='Resize', img_scale=(750, 1101), keep_ratio=True),
10
+ dict(type='RandomFlip', flip_ratio=0.5),
11
+ dict(type='Normalize', **img_norm_cfg),
12
+ dict(type='Pad', size_divisor=32),
13
+ dict(type='DefaultFormatBundle'),
14
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
15
+ ]
16
+ test_pipeline = [
17
+ dict(type='LoadImageFromFile'),
18
+ dict(
19
+ type='MultiScaleFlipAug',
20
+ img_scale=(750, 1101),
21
+ flip=False,
22
+ transforms=[
23
+ dict(type='Resize', keep_ratio=True),
24
+ dict(type='RandomFlip'),
25
+ dict(type='Normalize', **img_norm_cfg),
26
+ dict(type='Pad', size_divisor=32),
27
+ dict(type='ImageToTensor', keys=['img']),
28
+ dict(type='Collect', keys=['img']),
29
+ ])
30
+ ]
31
+ data = dict(
32
+ imgs_per_gpu=2,
33
+ workers_per_gpu=1,
34
+ train=dict(
35
+ type=dataset_type,
36
+ ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json',
37
+ img_prefix=data_root + 'Img/',
38
+ pipeline=train_pipeline,
39
+ data_root=data_root),
40
+ val=dict(
41
+ type=dataset_type,
42
+ ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json',
43
+ img_prefix=data_root + 'Img/',
44
+ pipeline=test_pipeline,
45
+ data_root=data_root),
46
+ test=dict(
47
+ type=dataset_type,
48
+ ann_file=data_root +
49
+ 'annotations/DeepFashion_segmentation_gallery.json',
50
+ img_prefix=data_root + 'Img/',
51
+ pipeline=test_pipeline,
52
+ data_root=data_root))
53
+ evaluation = dict(interval=5, metric=['bbox', 'segm'])
configs/_base_/datasets/lvis_v0.5_instance.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = 'coco_instance.py'
2
+ dataset_type = 'LVISV05Dataset'
3
+ data_root = 'data/lvis_v0.5/'
4
+ data = dict(
5
+ samples_per_gpu=2,
6
+ workers_per_gpu=2,
7
+ train=dict(
8
+ _delete_=True,
9
+ type='ClassBalancedDataset',
10
+ oversample_thr=1e-3,
11
+ dataset=dict(
12
+ type=dataset_type,
13
+ ann_file=data_root + 'annotations/lvis_v0.5_train.json',
14
+ img_prefix=data_root + 'train2017/')),
15
+ val=dict(
16
+ type=dataset_type,
17
+ ann_file=data_root + 'annotations/lvis_v0.5_val.json',
18
+ img_prefix=data_root + 'val2017/'),
19
+ test=dict(
20
+ type=dataset_type,
21
+ ann_file=data_root + 'annotations/lvis_v0.5_val.json',
22
+ img_prefix=data_root + 'val2017/'))
23
+ evaluation = dict(metric=['bbox', 'segm'])
configs/_base_/datasets/lvis_v1_instance.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = 'coco_instance.py'
2
+ dataset_type = 'LVISV1Dataset'
3
+ data_root = 'data/lvis_v1/'
4
+ data = dict(
5
+ samples_per_gpu=2,
6
+ workers_per_gpu=2,
7
+ train=dict(
8
+ _delete_=True,
9
+ type='ClassBalancedDataset',
10
+ oversample_thr=1e-3,
11
+ dataset=dict(
12
+ type=dataset_type,
13
+ ann_file=data_root + 'annotations/lvis_v1_train.json',
14
+ img_prefix=data_root)),
15
+ val=dict(
16
+ type=dataset_type,
17
+ ann_file=data_root + 'annotations/lvis_v1_val.json',
18
+ img_prefix=data_root),
19
+ test=dict(
20
+ type=dataset_type,
21
+ ann_file=data_root + 'annotations/lvis_v1_val.json',
22
+ img_prefix=data_root))
23
+ evaluation = dict(metric=['bbox', 'segm'])
configs/_base_/datasets/voc0712.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'VOCDataset'
3
+ data_root = 'data/VOCdevkit/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ train_pipeline = [
7
+ dict(type='LoadImageFromFile'),
8
+ dict(type='LoadAnnotations', with_bbox=True),
9
+ dict(type='Resize', img_scale=(1000, 600), keep_ratio=True),
10
+ dict(type='RandomFlip', flip_ratio=0.5),
11
+ dict(type='Normalize', **img_norm_cfg),
12
+ dict(type='Pad', size_divisor=32),
13
+ dict(type='DefaultFormatBundle'),
14
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
15
+ ]
16
+ test_pipeline = [
17
+ dict(type='LoadImageFromFile'),
18
+ dict(
19
+ type='MultiScaleFlipAug',
20
+ img_scale=(1000, 600),
21
+ flip=False,
22
+ transforms=[
23
+ dict(type='Resize', keep_ratio=True),
24
+ dict(type='RandomFlip'),
25
+ dict(type='Normalize', **img_norm_cfg),
26
+ dict(type='Pad', size_divisor=32),
27
+ dict(type='ImageToTensor', keys=['img']),
28
+ dict(type='Collect', keys=['img']),
29
+ ])
30
+ ]
31
+ data = dict(
32
+ samples_per_gpu=2,
33
+ workers_per_gpu=2,
34
+ train=dict(
35
+ type='RepeatDataset',
36
+ times=3,
37
+ dataset=dict(
38
+ type=dataset_type,
39
+ ann_file=[
40
+ data_root + 'VOC2007/ImageSets/Main/trainval.txt',
41
+ data_root + 'VOC2012/ImageSets/Main/trainval.txt'
42
+ ],
43
+ img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'],
44
+ pipeline=train_pipeline)),
45
+ val=dict(
46
+ type=dataset_type,
47
+ ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
48
+ img_prefix=data_root + 'VOC2007/',
49
+ pipeline=test_pipeline),
50
+ test=dict(
51
+ type=dataset_type,
52
+ ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
53
+ img_prefix=data_root + 'VOC2007/',
54
+ pipeline=test_pipeline))
55
+ evaluation = dict(interval=1, metric='mAP')
configs/_base_/datasets/wider_face.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'WIDERFaceDataset'
3
+ data_root = 'data/WIDERFace/'
4
+ img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile', to_float32=True),
7
+ dict(type='LoadAnnotations', with_bbox=True),
8
+ dict(
9
+ type='PhotoMetricDistortion',
10
+ brightness_delta=32,
11
+ contrast_range=(0.5, 1.5),
12
+ saturation_range=(0.5, 1.5),
13
+ hue_delta=18),
14
+ dict(
15
+ type='Expand',
16
+ mean=img_norm_cfg['mean'],
17
+ to_rgb=img_norm_cfg['to_rgb'],
18
+ ratio_range=(1, 4)),
19
+ dict(
20
+ type='MinIoURandomCrop',
21
+ min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
22
+ min_crop_size=0.3),
23
+ dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
24
+ dict(type='Normalize', **img_norm_cfg),
25
+ dict(type='RandomFlip', flip_ratio=0.5),
26
+ dict(type='DefaultFormatBundle'),
27
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
28
+ ]
29
+ test_pipeline = [
30
+ dict(type='LoadImageFromFile'),
31
+ dict(
32
+ type='MultiScaleFlipAug',
33
+ img_scale=(300, 300),
34
+ flip=False,
35
+ transforms=[
36
+ dict(type='Resize', keep_ratio=False),
37
+ dict(type='Normalize', **img_norm_cfg),
38
+ dict(type='ImageToTensor', keys=['img']),
39
+ dict(type='Collect', keys=['img']),
40
+ ])
41
+ ]
42
+ data = dict(
43
+ samples_per_gpu=60,
44
+ workers_per_gpu=2,
45
+ train=dict(
46
+ type='RepeatDataset',
47
+ times=2,
48
+ dataset=dict(
49
+ type=dataset_type,
50
+ ann_file=data_root + 'train.txt',
51
+ img_prefix=data_root + 'WIDER_train/',
52
+ min_size=17,
53
+ pipeline=train_pipeline)),
54
+ val=dict(
55
+ type=dataset_type,
56
+ ann_file=data_root + 'val.txt',
57
+ img_prefix=data_root + 'WIDER_val/',
58
+ pipeline=test_pipeline),
59
+ test=dict(
60
+ type=dataset_type,
61
+ ann_file=data_root + 'val.txt',
62
+ img_prefix=data_root + 'WIDER_val/',
63
+ pipeline=test_pipeline))
configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoint_config = dict(interval=1)
2
+ # yapf:disable
3
+ log_config = dict(
4
+ interval=50,
5
+ hooks=[
6
+ dict(type='TextLoggerHook'),
7
+ # dict(type='TensorboardLoggerHook')
8
+ ])
9
+ # yapf:enable
10
+ custom_hooks = [dict(type='NumClassCheckHook')]
11
+
12
+ dist_params = dict(backend='nccl')
13
+ log_level = 'INFO'
14
+ load_from = None
15
+ resume_from = None
16
+ workflow = [('train', 1)]
configs/_base_/models/cascade_mask_rcnn_r50_fpn.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='CascadeRCNN',
4
+ pretrained='torchvision://resnet50',
5
+ backbone=dict(
6
+ type='ResNet',
7
+ depth=50,
8
+ num_stages=4,
9
+ out_indices=(0, 1, 2, 3),
10
+ frozen_stages=1,
11
+ norm_cfg=dict(type='BN', requires_grad=True),
12
+ norm_eval=True,
13
+ style='pytorch'),
14
+ neck=dict(
15
+ type='FPN',
16
+ in_channels=[256, 512, 1024, 2048],
17
+ out_channels=256,
18
+ num_outs=5),
19
+ rpn_head=dict(
20
+ type='RPNHead',
21
+ in_channels=256,
22
+ feat_channels=256,
23
+ anchor_generator=dict(
24
+ type='AnchorGenerator',
25
+ scales=[8],
26
+ ratios=[0.5, 1.0, 2.0],
27
+ strides=[4, 8, 16, 32, 64]),
28
+ bbox_coder=dict(
29
+ type='DeltaXYWHBBoxCoder',
30
+ target_means=[.0, .0, .0, .0],
31
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
32
+ loss_cls=dict(
33
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
34
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
35
+ roi_head=dict(
36
+ type='CascadeRoIHead',
37
+ num_stages=3,
38
+ stage_loss_weights=[1, 0.5, 0.25],
39
+ bbox_roi_extractor=dict(
40
+ type='SingleRoIExtractor',
41
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
42
+ out_channels=256,
43
+ featmap_strides=[4, 8, 16, 32]),
44
+ bbox_head=[
45
+ dict(
46
+ type='Shared2FCBBoxHead',
47
+ in_channels=256,
48
+ fc_out_channels=1024,
49
+ roi_feat_size=7,
50
+ num_classes=80,
51
+ bbox_coder=dict(
52
+ type='DeltaXYWHBBoxCoder',
53
+ target_means=[0., 0., 0., 0.],
54
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
55
+ reg_class_agnostic=True,
56
+ loss_cls=dict(
57
+ type='CrossEntropyLoss',
58
+ use_sigmoid=False,
59
+ loss_weight=1.0),
60
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
61
+ loss_weight=1.0)),
62
+ dict(
63
+ type='Shared2FCBBoxHead',
64
+ in_channels=256,
65
+ fc_out_channels=1024,
66
+ roi_feat_size=7,
67
+ num_classes=80,
68
+ bbox_coder=dict(
69
+ type='DeltaXYWHBBoxCoder',
70
+ target_means=[0., 0., 0., 0.],
71
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
72
+ reg_class_agnostic=True,
73
+ loss_cls=dict(
74
+ type='CrossEntropyLoss',
75
+ use_sigmoid=False,
76
+ loss_weight=1.0),
77
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
78
+ loss_weight=1.0)),
79
+ dict(
80
+ type='Shared2FCBBoxHead',
81
+ in_channels=256,
82
+ fc_out_channels=1024,
83
+ roi_feat_size=7,
84
+ num_classes=80,
85
+ bbox_coder=dict(
86
+ type='DeltaXYWHBBoxCoder',
87
+ target_means=[0., 0., 0., 0.],
88
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
89
+ reg_class_agnostic=True,
90
+ loss_cls=dict(
91
+ type='CrossEntropyLoss',
92
+ use_sigmoid=False,
93
+ loss_weight=1.0),
94
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
95
+ ],
96
+ mask_roi_extractor=dict(
97
+ type='SingleRoIExtractor',
98
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
99
+ out_channels=256,
100
+ featmap_strides=[4, 8, 16, 32]),
101
+ mask_head=dict(
102
+ type='FCNMaskHead',
103
+ num_convs=4,
104
+ in_channels=256,
105
+ conv_out_channels=256,
106
+ num_classes=80,
107
+ loss_mask=dict(
108
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
109
+ # model training and testing settings
110
+ train_cfg=dict(
111
+ rpn=dict(
112
+ assigner=dict(
113
+ type='MaxIoUAssigner',
114
+ pos_iou_thr=0.7,
115
+ neg_iou_thr=0.3,
116
+ min_pos_iou=0.3,
117
+ match_low_quality=True,
118
+ ignore_iof_thr=-1),
119
+ sampler=dict(
120
+ type='RandomSampler',
121
+ num=256,
122
+ pos_fraction=0.5,
123
+ neg_pos_ub=-1,
124
+ add_gt_as_proposals=False),
125
+ allowed_border=0,
126
+ pos_weight=-1,
127
+ debug=False),
128
+ rpn_proposal=dict(
129
+ nms_pre=2000,
130
+ max_per_img=2000,
131
+ nms=dict(type='nms', iou_threshold=0.7),
132
+ min_bbox_size=0),
133
+ rcnn=[
134
+ dict(
135
+ assigner=dict(
136
+ type='MaxIoUAssigner',
137
+ pos_iou_thr=0.5,
138
+ neg_iou_thr=0.5,
139
+ min_pos_iou=0.5,
140
+ match_low_quality=False,
141
+ ignore_iof_thr=-1),
142
+ sampler=dict(
143
+ type='RandomSampler',
144
+ num=512,
145
+ pos_fraction=0.25,
146
+ neg_pos_ub=-1,
147
+ add_gt_as_proposals=True),
148
+ mask_size=28,
149
+ pos_weight=-1,
150
+ debug=False),
151
+ dict(
152
+ assigner=dict(
153
+ type='MaxIoUAssigner',
154
+ pos_iou_thr=0.6,
155
+ neg_iou_thr=0.6,
156
+ min_pos_iou=0.6,
157
+ match_low_quality=False,
158
+ ignore_iof_thr=-1),
159
+ sampler=dict(
160
+ type='RandomSampler',
161
+ num=512,
162
+ pos_fraction=0.25,
163
+ neg_pos_ub=-1,
164
+ add_gt_as_proposals=True),
165
+ mask_size=28,
166
+ pos_weight=-1,
167
+ debug=False),
168
+ dict(
169
+ assigner=dict(
170
+ type='MaxIoUAssigner',
171
+ pos_iou_thr=0.7,
172
+ neg_iou_thr=0.7,
173
+ min_pos_iou=0.7,
174
+ match_low_quality=False,
175
+ ignore_iof_thr=-1),
176
+ sampler=dict(
177
+ type='RandomSampler',
178
+ num=512,
179
+ pos_fraction=0.25,
180
+ neg_pos_ub=-1,
181
+ add_gt_as_proposals=True),
182
+ mask_size=28,
183
+ pos_weight=-1,
184
+ debug=False)
185
+ ]),
186
+ test_cfg=dict(
187
+ rpn=dict(
188
+ nms_pre=1000,
189
+ max_per_img=1000,
190
+ nms=dict(type='nms', iou_threshold=0.7),
191
+ min_bbox_size=0),
192
+ rcnn=dict(
193
+ score_thr=0.05,
194
+ nms=dict(type='nms', iou_threshold=0.5),
195
+ max_per_img=100,
196
+ mask_thr_binary=0.5)))
configs/_base_/models/cascade_mask_rcnn_swin_fpn.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='CascadeRCNN',
4
+ pretrained=None,
5
+ backbone=dict(
6
+ type='SwinTransformer',
7
+ embed_dim=96,
8
+ depths=[2, 2, 6, 2],
9
+ num_heads=[3, 6, 12, 24],
10
+ window_size=7,
11
+ mlp_ratio=4.,
12
+ qkv_bias=True,
13
+ qk_scale=None,
14
+ drop_rate=0.,
15
+ attn_drop_rate=0.,
16
+ drop_path_rate=0.2,
17
+ ape=False,
18
+ patch_norm=True,
19
+ out_indices=(0, 1, 2, 3),
20
+ use_checkpoint=False),
21
+ neck=dict(
22
+ type='FPN',
23
+ in_channels=[96, 192, 384, 768],
24
+ out_channels=256,
25
+ num_outs=5),
26
+ rpn_head=dict(
27
+ type='RPNHead',
28
+ in_channels=256,
29
+ feat_channels=256,
30
+ anchor_generator=dict(
31
+ type='AnchorGenerator',
32
+ scales=[8],
33
+ ratios=[0.5, 1.0, 2.0],
34
+ strides=[4, 8, 16, 32, 64]),
35
+ bbox_coder=dict(
36
+ type='DeltaXYWHBBoxCoder',
37
+ target_means=[.0, .0, .0, .0],
38
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
39
+ loss_cls=dict(
40
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
41
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
42
+ roi_head=dict(
43
+ type='CascadeRoIHead',
44
+ num_stages=3,
45
+ stage_loss_weights=[1, 0.5, 0.25],
46
+ bbox_roi_extractor=dict(
47
+ type='SingleRoIExtractor',
48
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
49
+ out_channels=256,
50
+ featmap_strides=[4, 8, 16, 32]),
51
+ bbox_head=[
52
+ dict(
53
+ type='Shared2FCBBoxHead',
54
+ in_channels=256,
55
+ fc_out_channels=1024,
56
+ roi_feat_size=7,
57
+ num_classes=80,
58
+ bbox_coder=dict(
59
+ type='DeltaXYWHBBoxCoder',
60
+ target_means=[0., 0., 0., 0.],
61
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
62
+ reg_class_agnostic=True,
63
+ loss_cls=dict(
64
+ type='CrossEntropyLoss',
65
+ use_sigmoid=False,
66
+ loss_weight=1.0),
67
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
68
+ loss_weight=1.0)),
69
+ dict(
70
+ type='Shared2FCBBoxHead',
71
+ in_channels=256,
72
+ fc_out_channels=1024,
73
+ roi_feat_size=7,
74
+ num_classes=80,
75
+ bbox_coder=dict(
76
+ type='DeltaXYWHBBoxCoder',
77
+ target_means=[0., 0., 0., 0.],
78
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
79
+ reg_class_agnostic=True,
80
+ loss_cls=dict(
81
+ type='CrossEntropyLoss',
82
+ use_sigmoid=False,
83
+ loss_weight=1.0),
84
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
85
+ loss_weight=1.0)),
86
+ dict(
87
+ type='Shared2FCBBoxHead',
88
+ in_channels=256,
89
+ fc_out_channels=1024,
90
+ roi_feat_size=7,
91
+ num_classes=80,
92
+ bbox_coder=dict(
93
+ type='DeltaXYWHBBoxCoder',
94
+ target_means=[0., 0., 0., 0.],
95
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
96
+ reg_class_agnostic=True,
97
+ loss_cls=dict(
98
+ type='CrossEntropyLoss',
99
+ use_sigmoid=False,
100
+ loss_weight=1.0),
101
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
102
+ ],
103
+ mask_roi_extractor=dict(
104
+ type='SingleRoIExtractor',
105
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
106
+ out_channels=256,
107
+ featmap_strides=[4, 8, 16, 32]),
108
+ mask_head=dict(
109
+ type='FCNMaskHead',
110
+ num_convs=4,
111
+ in_channels=256,
112
+ conv_out_channels=256,
113
+ num_classes=80,
114
+ loss_mask=dict(
115
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
116
+ # model training and testing settings
117
+ train_cfg = dict(
118
+ rpn=dict(
119
+ assigner=dict(
120
+ type='MaxIoUAssigner',
121
+ pos_iou_thr=0.7,
122
+ neg_iou_thr=0.3,
123
+ min_pos_iou=0.3,
124
+ match_low_quality=True,
125
+ ignore_iof_thr=-1),
126
+ sampler=dict(
127
+ type='RandomSampler',
128
+ num=256,
129
+ pos_fraction=0.5,
130
+ neg_pos_ub=-1,
131
+ add_gt_as_proposals=False),
132
+ allowed_border=0,
133
+ pos_weight=-1,
134
+ debug=False),
135
+ rpn_proposal=dict(
136
+ nms_across_levels=False,
137
+ nms_pre=2000,
138
+ nms_post=2000,
139
+ max_per_img=2000,
140
+ nms=dict(type='nms', iou_threshold=0.7),
141
+ min_bbox_size=0),
142
+ rcnn=[
143
+ dict(
144
+ assigner=dict(
145
+ type='MaxIoUAssigner',
146
+ pos_iou_thr=0.5,
147
+ neg_iou_thr=0.5,
148
+ min_pos_iou=0.5,
149
+ match_low_quality=False,
150
+ ignore_iof_thr=-1),
151
+ sampler=dict(
152
+ type='RandomSampler',
153
+ num=512,
154
+ pos_fraction=0.25,
155
+ neg_pos_ub=-1,
156
+ add_gt_as_proposals=True),
157
+ mask_size=28,
158
+ pos_weight=-1,
159
+ debug=False),
160
+ dict(
161
+ assigner=dict(
162
+ type='MaxIoUAssigner',
163
+ pos_iou_thr=0.6,
164
+ neg_iou_thr=0.6,
165
+ min_pos_iou=0.6,
166
+ match_low_quality=False,
167
+ ignore_iof_thr=-1),
168
+ sampler=dict(
169
+ type='RandomSampler',
170
+ num=512,
171
+ pos_fraction=0.25,
172
+ neg_pos_ub=-1,
173
+ add_gt_as_proposals=True),
174
+ mask_size=28,
175
+ pos_weight=-1,
176
+ debug=False),
177
+ dict(
178
+ assigner=dict(
179
+ type='MaxIoUAssigner',
180
+ pos_iou_thr=0.7,
181
+ neg_iou_thr=0.7,
182
+ min_pos_iou=0.7,
183
+ match_low_quality=False,
184
+ ignore_iof_thr=-1),
185
+ sampler=dict(
186
+ type='RandomSampler',
187
+ num=512,
188
+ pos_fraction=0.25,
189
+ neg_pos_ub=-1,
190
+ add_gt_as_proposals=True),
191
+ mask_size=28,
192
+ pos_weight=-1,
193
+ debug=False)
194
+ ]),
195
+ test_cfg = dict(
196
+ rpn=dict(
197
+ nms_across_levels=False,
198
+ nms_pre=1000,
199
+ nms_post=1000,
200
+ max_per_img=1000,
201
+ nms=dict(type='nms', iou_threshold=0.7),
202
+ min_bbox_size=0),
203
+ rcnn=dict(
204
+ score_thr=0.05,
205
+ nms=dict(type='nms', iou_threshold=0.5),
206
+ max_per_img=100,
207
+ mask_thr_binary=0.5)))
configs/_base_/models/cascade_mask_rcnn_uniformer_fpn.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='CascadeRCNN',
4
+ pretrained=None,
5
+ backbone=dict(
6
+ type='UniFormer',
7
+ embed_dim=[64, 128, 320, 512],
8
+ layers=[3, 4, 8, 3],
9
+ head_dim=64,
10
+ mlp_ratio=4.,
11
+ qkv_bias=True,
12
+ drop_rate=0.,
13
+ attn_drop_rate=0.,
14
+ drop_path_rate=0.2),
15
+ neck=dict(
16
+ type='FPN',
17
+ in_channels=[64, 128, 320, 512],
18
+ out_channels=256,
19
+ num_outs=5),
20
+ rpn_head=dict(
21
+ type='RPNHead',
22
+ in_channels=256,
23
+ feat_channels=256,
24
+ anchor_generator=dict(
25
+ type='AnchorGenerator',
26
+ scales=[8],
27
+ ratios=[0.5, 1.0, 2.0],
28
+ strides=[4, 8, 16, 32, 64]),
29
+ bbox_coder=dict(
30
+ type='DeltaXYWHBBoxCoder',
31
+ target_means=[.0, .0, .0, .0],
32
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
33
+ loss_cls=dict(
34
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
35
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
36
+ roi_head=dict(
37
+ type='CascadeRoIHead',
38
+ num_stages=3,
39
+ stage_loss_weights=[1, 0.5, 0.25],
40
+ bbox_roi_extractor=dict(
41
+ type='SingleRoIExtractor',
42
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
43
+ out_channels=256,
44
+ featmap_strides=[4, 8, 16, 32]),
45
+ bbox_head=[
46
+ dict(
47
+ type='Shared2FCBBoxHead',
48
+ in_channels=256,
49
+ fc_out_channels=1024,
50
+ roi_feat_size=7,
51
+ num_classes=80,
52
+ bbox_coder=dict(
53
+ type='DeltaXYWHBBoxCoder',
54
+ target_means=[0., 0., 0., 0.],
55
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
56
+ reg_class_agnostic=True,
57
+ loss_cls=dict(
58
+ type='CrossEntropyLoss',
59
+ use_sigmoid=False,
60
+ loss_weight=1.0),
61
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
62
+ loss_weight=1.0)),
63
+ dict(
64
+ type='Shared2FCBBoxHead',
65
+ in_channels=256,
66
+ fc_out_channels=1024,
67
+ roi_feat_size=7,
68
+ num_classes=80,
69
+ bbox_coder=dict(
70
+ type='DeltaXYWHBBoxCoder',
71
+ target_means=[0., 0., 0., 0.],
72
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
73
+ reg_class_agnostic=True,
74
+ loss_cls=dict(
75
+ type='CrossEntropyLoss',
76
+ use_sigmoid=False,
77
+ loss_weight=1.0),
78
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
79
+ loss_weight=1.0)),
80
+ dict(
81
+ type='Shared2FCBBoxHead',
82
+ in_channels=256,
83
+ fc_out_channels=1024,
84
+ roi_feat_size=7,
85
+ num_classes=80,
86
+ bbox_coder=dict(
87
+ type='DeltaXYWHBBoxCoder',
88
+ target_means=[0., 0., 0., 0.],
89
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
90
+ reg_class_agnostic=True,
91
+ loss_cls=dict(
92
+ type='CrossEntropyLoss',
93
+ use_sigmoid=False,
94
+ loss_weight=1.0),
95
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
96
+ ],
97
+ mask_roi_extractor=dict(
98
+ type='SingleRoIExtractor',
99
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
100
+ out_channels=256,
101
+ featmap_strides=[4, 8, 16, 32]),
102
+ mask_head=dict(
103
+ type='FCNMaskHead',
104
+ num_convs=4,
105
+ in_channels=256,
106
+ conv_out_channels=256,
107
+ num_classes=80,
108
+ loss_mask=dict(
109
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
110
+ # model training and testing settings
111
+ train_cfg = dict(
112
+ rpn=dict(
113
+ assigner=dict(
114
+ type='MaxIoUAssigner',
115
+ pos_iou_thr=0.7,
116
+ neg_iou_thr=0.3,
117
+ min_pos_iou=0.3,
118
+ match_low_quality=True,
119
+ ignore_iof_thr=-1),
120
+ sampler=dict(
121
+ type='RandomSampler',
122
+ num=256,
123
+ pos_fraction=0.5,
124
+ neg_pos_ub=-1,
125
+ add_gt_as_proposals=False),
126
+ allowed_border=0,
127
+ pos_weight=-1,
128
+ debug=False),
129
+ rpn_proposal=dict(
130
+ nms_across_levels=False,
131
+ nms_pre=2000,
132
+ nms_post=2000,
133
+ max_per_img=2000,
134
+ nms=dict(type='nms', iou_threshold=0.7),
135
+ min_bbox_size=0),
136
+ rcnn=[
137
+ dict(
138
+ assigner=dict(
139
+ type='MaxIoUAssigner',
140
+ pos_iou_thr=0.5,
141
+ neg_iou_thr=0.5,
142
+ min_pos_iou=0.5,
143
+ match_low_quality=False,
144
+ ignore_iof_thr=-1),
145
+ sampler=dict(
146
+ type='RandomSampler',
147
+ num=512,
148
+ pos_fraction=0.25,
149
+ neg_pos_ub=-1,
150
+ add_gt_as_proposals=True),
151
+ mask_size=28,
152
+ pos_weight=-1,
153
+ debug=False),
154
+ dict(
155
+ assigner=dict(
156
+ type='MaxIoUAssigner',
157
+ pos_iou_thr=0.6,
158
+ neg_iou_thr=0.6,
159
+ min_pos_iou=0.6,
160
+ match_low_quality=False,
161
+ ignore_iof_thr=-1),
162
+ sampler=dict(
163
+ type='RandomSampler',
164
+ num=512,
165
+ pos_fraction=0.25,
166
+ neg_pos_ub=-1,
167
+ add_gt_as_proposals=True),
168
+ mask_size=28,
169
+ pos_weight=-1,
170
+ debug=False),
171
+ dict(
172
+ assigner=dict(
173
+ type='MaxIoUAssigner',
174
+ pos_iou_thr=0.7,
175
+ neg_iou_thr=0.7,
176
+ min_pos_iou=0.7,
177
+ match_low_quality=False,
178
+ ignore_iof_thr=-1),
179
+ sampler=dict(
180
+ type='RandomSampler',
181
+ num=512,
182
+ pos_fraction=0.25,
183
+ neg_pos_ub=-1,
184
+ add_gt_as_proposals=True),
185
+ mask_size=28,
186
+ pos_weight=-1,
187
+ debug=False)
188
+ ]),
189
+ test_cfg = dict(
190
+ rpn=dict(
191
+ nms_across_levels=False,
192
+ nms_pre=1000,
193
+ nms_post=1000,
194
+ max_per_img=1000,
195
+ nms=dict(type='nms', iou_threshold=0.7),
196
+ min_bbox_size=0),
197
+ rcnn=dict(
198
+ score_thr=0.05,
199
+ nms=dict(type='nms', iou_threshold=0.5),
200
+ max_per_img=100,
201
+ mask_thr_binary=0.5)))
configs/_base_/models/cascade_rcnn_r50_fpn.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model settings
# Cascade R-CNN (detection only, no mask branch) with an ImageNet-pretrained
# ResNet-50 backbone + FPN. Three cascaded bbox heads refine proposals at
# increasing IoU thresholds (0.5 / 0.6 / 0.7).
model = dict(
    type='CascadeRCNN',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        # Later stages contribute progressively less to the total loss.
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        # One bbox head per cascade stage; regression target stds shrink
        # stage by stage as the boxes get more accurate.
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ]),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        # One rcnn training config per cascade stage; IoU thresholds rise
        # 0.5 -> 0.6 -> 0.7 in lockstep with the bbox heads.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))
configs/_base_/models/fast_rcnn_r50_fpn.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model settings
# Fast R-CNN with ResNet-50 + FPN: no RPN head — external proposals are
# expected at train and test time, hence only an `rcnn` section in
# train_cfg/test_cfg.
model = dict(
    type='FastRCNN',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))
configs/_base_/models/faster_rcnn_r50_caffe_c4.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model settings
# Faster R-CNN, Caffe-style ResNet-50 C4 variant: no FPN — the backbone stops
# after stage 3 (single 16-stride feature map) and the conv5 block is reused
# as the RoI shared head.
# Frozen (non-trainable) BN, as is standard for Caffe-pretrained backbones.
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
    type='FasterRCNN',
    pretrained='open-mmlab://detectron2/resnet50_caffe',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=3,
        strides=(1, 2, 2),
        dilations=(1, 1, 1),
        out_indices=(2, ),
        frozen_stages=1,
        norm_cfg=norm_cfg,
        norm_eval=True,
        style='caffe'),
    rpn_head=dict(
        type='RPNHead',
        in_channels=1024,
        feat_channels=1024,
        # Multi-scale anchors on the single 16-stride map compensate for the
        # missing FPN pyramid.
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[2, 4, 8, 16, 32],
            ratios=[0.5, 1.0, 2.0],
            strides=[16]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        # conv5 (stage 3) of the ResNet acts as the per-RoI feature extractor.
        shared_head=dict(
            type='ResLayer',
            depth=50,
            stage=3,
            stride=2,
            dilation=1,
            style='caffe',
            norm_cfg=norm_cfg,
            norm_eval=True),
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=1024,
            featmap_strides=[16]),
        bbox_head=dict(
            type='BBoxHead',
            with_avg_pool=True,
            roi_feat_size=7,
            in_channels=2048,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        # Higher nms_pre than FPN configs: single-level proposals only.
        rpn_proposal=dict(
            nms_pre=12000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=6000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))
configs/_base_/models/faster_rcnn_r50_caffe_dc5.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model settings
# Faster R-CNN, Caffe-style ResNet-50 DC5 variant: stage 4 keeps stride 1 with
# dilation 2 (dilated conv5), so the single output map stays at stride 16.
# No FPN neck.
# Frozen (non-trainable) BN, as is standard for Caffe-pretrained backbones.
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
    type='FasterRCNN',
    pretrained='open-mmlab://detectron2/resnet50_caffe',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # stride 1 + dilation 2 in the last stage = the "DC5" trick.
        strides=(1, 2, 2, 1),
        dilations=(1, 1, 1, 2),
        out_indices=(3, ),
        frozen_stages=1,
        norm_cfg=norm_cfg,
        norm_eval=True,
        style='caffe'),
    rpn_head=dict(
        type='RPNHead',
        in_channels=2048,
        feat_channels=2048,
        # Multi-scale anchors on the single 16-stride map (no FPN pyramid).
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[2, 4, 8, 16, 32],
            ratios=[0.5, 1.0, 2.0],
            strides=[16]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=2048,
            featmap_strides=[16]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=2048,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        # Higher nms_pre than FPN configs: single-level proposals only.
        rpn_proposal=dict(
            nms_pre=12000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms=dict(type='nms', iou_threshold=0.7),
            nms_pre=6000,
            max_per_img=1000,
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)))
configs/_base_/models/faster_rcnn_r50_fpn.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Faster R-CNN with ImageNet-pretrained ResNet-50 + FPN — the standard
# two-stage detection baseline (RPN proposals + a single Shared2FC bbox head).
model = dict(
    type='FasterRCNN',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        # Single anchor scale per level; the FPN pyramid supplies the scales.
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
    ))
configs/_base_/models/mask_rcnn_r50_caffe_c4.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model settings
# Mask R-CNN, Caffe-style ResNet-50 C4 variant: single 16-stride feature map
# (no FPN), conv5 as the RoI shared head; the mask head reuses the shared-head
# features directly (mask_roi_extractor=None) with no extra convs.
# Frozen (non-trainable) BN, as is standard for Caffe-pretrained backbones.
norm_cfg = dict(type='BN', requires_grad=False)
model = dict(
    type='MaskRCNN',
    pretrained='open-mmlab://detectron2/resnet50_caffe',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=3,
        strides=(1, 2, 2),
        dilations=(1, 1, 1),
        out_indices=(2, ),
        frozen_stages=1,
        norm_cfg=norm_cfg,
        norm_eval=True,
        style='caffe'),
    rpn_head=dict(
        type='RPNHead',
        in_channels=1024,
        feat_channels=1024,
        # Multi-scale anchors on the single 16-stride map (no FPN pyramid).
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[2, 4, 8, 16, 32],
            ratios=[0.5, 1.0, 2.0],
            strides=[16]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        # conv5 (stage 3) of the ResNet acts as the per-RoI feature extractor.
        shared_head=dict(
            type='ResLayer',
            depth=50,
            stage=3,
            stride=2,
            dilation=1,
            style='caffe',
            norm_cfg=norm_cfg,
            norm_eval=True),
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=1024,
            featmap_strides=[16]),
        bbox_head=dict(
            type='BBoxHead',
            with_avg_pool=True,
            roi_feat_size=7,
            in_channels=2048,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        # None: mask head consumes the shared-head output, no separate RoIAlign.
        mask_roi_extractor=None,
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=0,
            in_channels=2048,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        # Higher nms_pre than FPN configs: single-level proposals only.
        rpn_proposal=dict(
            nms_pre=12000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            # 14x14 mask targets (28x28 is used in the FPN variants).
            mask_size=14,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=6000,
            nms=dict(type='nms', iou_threshold=0.7),
            max_per_img=1000,
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
configs/_base_/models/mask_rcnn_r50_fpn.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model settings
# Mask R-CNN with ImageNet-pretrained ResNet-50 + FPN: standard two-stage
# detector plus a 4-conv FCN mask head on 14x14 RoIAligned features.
model = dict(
    type='MaskRCNN',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        # Single anchor scale per level; the FPN pyramid supplies the scales.
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
configs/_base_/models/mask_rcnn_swin_fpn.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# model settings
# Mask R-CNN with a Swin Transformer (Swin-T geometry: embed_dim=96,
# depths 2/2/6/2) backbone + FPN. Heads and train/test settings mirror the
# ResNet-50 FPN Mask R-CNN config.
model = dict(
    type='MaskRCNN',
    pretrained=None,
    backbone=dict(
        type='SwinTransformer',
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        ape=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        use_checkpoint=False),
    neck=dict(
        type='FPN',
        # FPN inputs match the four Swin stage widths (96 * 2**i).
        in_channels=[96, 192, 384, 768],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
configs/_base_/models/mask_rcnn_uniformer_fpn.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='MaskRCNN',
4
+ pretrained=None,
5
+ backbone=dict(
6
+ type='UniFormer',
7
+ embed_dim=[64, 128, 320, 512],
8
+ layers=[3, 4, 8, 3],
9
+ head_dim=64,
10
+ mlp_ratio=4.,
11
+ qkv_bias=True,
12
+ drop_rate=0.,
13
+ attn_drop_rate=0.,
14
+ drop_path_rate=0.2),
15
+ neck=dict(
16
+ type='FPN',
17
+ in_channels=[64, 128, 320, 512],
18
+ out_channels=256,
19
+ num_outs=5),
20
+ rpn_head=dict(
21
+ type='RPNHead',
22
+ in_channels=256,
23
+ feat_channels=256,
24
+ anchor_generator=dict(
25
+ type='AnchorGenerator',
26
+ scales=[8],
27
+ ratios=[0.5, 1.0, 2.0],
28
+ strides=[4, 8, 16, 32, 64]),
29
+ bbox_coder=dict(
30
+ type='DeltaXYWHBBoxCoder',
31
+ target_means=[.0, .0, .0, .0],
32
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
33
+ loss_cls=dict(
34
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
35
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
36
+ roi_head=dict(
37
+ type='StandardRoIHead',
38
+ bbox_roi_extractor=dict(
39
+ type='SingleRoIExtractor',
40
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
41
+ out_channels=256,
42
+ featmap_strides=[4, 8, 16, 32]),
43
+ bbox_head=dict(
44
+ type='Shared2FCBBoxHead',
45
+ in_channels=256,
46
+ fc_out_channels=1024,
47
+ roi_feat_size=7,
48
+ num_classes=80,
49
+ bbox_coder=dict(
50
+ type='DeltaXYWHBBoxCoder',
51
+ target_means=[0., 0., 0., 0.],
52
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
53
+ reg_class_agnostic=False,
54
+ loss_cls=dict(
55
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
56
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
57
+ mask_roi_extractor=dict(
58
+ type='SingleRoIExtractor',
59
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
60
+ out_channels=256,
61
+ featmap_strides=[4, 8, 16, 32]),
62
+ mask_head=dict(
63
+ type='FCNMaskHead',
64
+ num_convs=4,
65
+ in_channels=256,
66
+ conv_out_channels=256,
67
+ num_classes=80,
68
+ loss_mask=dict(
69
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
70
+ # model training and testing settings
71
+ train_cfg=dict(
72
+ rpn=dict(
73
+ assigner=dict(
74
+ type='MaxIoUAssigner',
75
+ pos_iou_thr=0.7,
76
+ neg_iou_thr=0.3,
77
+ min_pos_iou=0.3,
78
+ match_low_quality=True,
79
+ ignore_iof_thr=-1),
80
+ sampler=dict(
81
+ type='RandomSampler',
82
+ num=256,
83
+ pos_fraction=0.5,
84
+ neg_pos_ub=-1,
85
+ add_gt_as_proposals=False),
86
+ allowed_border=-1,
87
+ pos_weight=-1,
88
+ debug=False),
89
+ rpn_proposal=dict(
90
+ nms_pre=2000,
91
+ max_per_img=1000,
92
+ nms=dict(type='nms', iou_threshold=0.7),
93
+ min_bbox_size=0),
94
+ rcnn=dict(
95
+ assigner=dict(
96
+ type='MaxIoUAssigner',
97
+ pos_iou_thr=0.5,
98
+ neg_iou_thr=0.5,
99
+ min_pos_iou=0.5,
100
+ match_low_quality=True,
101
+ ignore_iof_thr=-1),
102
+ sampler=dict(
103
+ type='RandomSampler',
104
+ num=512,
105
+ pos_fraction=0.25,
106
+ neg_pos_ub=-1,
107
+ add_gt_as_proposals=True),
108
+ mask_size=28,
109
+ pos_weight=-1,
110
+ debug=False)),
111
+ test_cfg=dict(
112
+ rpn=dict(
113
+ nms_pre=1000,
114
+ max_per_img=1000,
115
+ nms=dict(type='nms', iou_threshold=0.7),
116
+ min_bbox_size=0),
117
+ rcnn=dict(
118
+ score_thr=0.05,
119
+ nms=dict(type='nms', iou_threshold=0.5),
120
+ max_per_img=100,
121
+ mask_thr_binary=0.5)))
configs/_base_/models/retinanet_r50_fpn.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='RetinaNet',
4
+ pretrained='torchvision://resnet50',
5
+ backbone=dict(
6
+ type='ResNet',
7
+ depth=50,
8
+ num_stages=4,
9
+ out_indices=(0, 1, 2, 3),
10
+ frozen_stages=1,
11
+ norm_cfg=dict(type='BN', requires_grad=True),
12
+ norm_eval=True,
13
+ style='pytorch'),
14
+ neck=dict(
15
+ type='FPN',
16
+ in_channels=[256, 512, 1024, 2048],
17
+ out_channels=256,
18
+ start_level=1,
19
+ add_extra_convs='on_input',
20
+ num_outs=5),
21
+ bbox_head=dict(
22
+ type='RetinaHead',
23
+ num_classes=80,
24
+ in_channels=256,
25
+ stacked_convs=4,
26
+ feat_channels=256,
27
+ anchor_generator=dict(
28
+ type='AnchorGenerator',
29
+ octave_base_scale=4,
30
+ scales_per_octave=3,
31
+ ratios=[0.5, 1.0, 2.0],
32
+ strides=[8, 16, 32, 64, 128]),
33
+ bbox_coder=dict(
34
+ type='DeltaXYWHBBoxCoder',
35
+ target_means=[.0, .0, .0, .0],
36
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
37
+ loss_cls=dict(
38
+ type='FocalLoss',
39
+ use_sigmoid=True,
40
+ gamma=2.0,
41
+ alpha=0.25,
42
+ loss_weight=1.0),
43
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
44
+ # training and testing settings
45
+ train_cfg=dict(
46
+ assigner=dict(
47
+ type='MaxIoUAssigner',
48
+ pos_iou_thr=0.5,
49
+ neg_iou_thr=0.4,
50
+ min_pos_iou=0,
51
+ ignore_iof_thr=-1),
52
+ allowed_border=-1,
53
+ pos_weight=-1,
54
+ debug=False),
55
+ test_cfg=dict(
56
+ nms_pre=1000,
57
+ min_bbox_size=0,
58
+ score_thr=0.05,
59
+ nms=dict(type='nms', iou_threshold=0.5),
60
+ max_per_img=100))
configs/_base_/models/rpn_r50_caffe_c4.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='RPN',
4
+ pretrained='open-mmlab://detectron2/resnet50_caffe',
5
+ backbone=dict(
6
+ type='ResNet',
7
+ depth=50,
8
+ num_stages=3,
9
+ strides=(1, 2, 2),
10
+ dilations=(1, 1, 1),
11
+ out_indices=(2, ),
12
+ frozen_stages=1,
13
+ norm_cfg=dict(type='BN', requires_grad=False),
14
+ norm_eval=True,
15
+ style='caffe'),
16
+ neck=None,
17
+ rpn_head=dict(
18
+ type='RPNHead',
19
+ in_channels=1024,
20
+ feat_channels=1024,
21
+ anchor_generator=dict(
22
+ type='AnchorGenerator',
23
+ scales=[2, 4, 8, 16, 32],
24
+ ratios=[0.5, 1.0, 2.0],
25
+ strides=[16]),
26
+ bbox_coder=dict(
27
+ type='DeltaXYWHBBoxCoder',
28
+ target_means=[.0, .0, .0, .0],
29
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
30
+ loss_cls=dict(
31
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
32
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
33
+ # model training and testing settings
34
+ train_cfg=dict(
35
+ rpn=dict(
36
+ assigner=dict(
37
+ type='MaxIoUAssigner',
38
+ pos_iou_thr=0.7,
39
+ neg_iou_thr=0.3,
40
+ min_pos_iou=0.3,
41
+ ignore_iof_thr=-1),
42
+ sampler=dict(
43
+ type='RandomSampler',
44
+ num=256,
45
+ pos_fraction=0.5,
46
+ neg_pos_ub=-1,
47
+ add_gt_as_proposals=False),
48
+ allowed_border=0,
49
+ pos_weight=-1,
50
+ debug=False)),
51
+ test_cfg=dict(
52
+ rpn=dict(
53
+ nms_pre=12000,
54
+ max_per_img=2000,
55
+ nms=dict(type='nms', iou_threshold=0.7),
56
+ min_bbox_size=0)))
configs/_base_/models/rpn_r50_fpn.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+
3
+ model = dict(
4
+ type='RPN',
5
+ pretrained='torchvision://resnet50',
6
+ backbone=dict(
7
+ type='ResNet',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ frozen_stages=1,
12
+ norm_cfg=dict(type='BN', requires_grad=True),
13
+ norm_eval=True,
14
+ style='pytorch'),
15
+ neck=dict(
16
+ type='FPN',
17
+ in_channels=[256, 512, 1024, 2048],
18
+ out_channels=256,
19
+ num_outs=5),
20
+ rpn_head=dict(
21
+ type='RPNHead',
22
+ in_channels=256,
23
+ feat_channels=256,
24
+ anchor_generator=dict(
25
+ type='AnchorGenerator',
26
+ scales=[8],
27
+ ratios=[0.5, 1.0, 2.0],
28
+ strides=[4, 8, 16, 32, 64]),
29
+ bbox_coder=dict(
30
+ type='DeltaXYWHBBoxCoder',
31
+ target_means=[.0, .0, .0, .0],
32
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
33
+ loss_cls=dict(
34
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
35
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
36
+ # model training and testing settings
37
+ train_cfg=dict(
38
+ rpn=dict(
39
+ assigner=dict(
40
+ type='MaxIoUAssigner',
41
+ pos_iou_thr=0.7,
42
+ neg_iou_thr=0.3,
43
+ min_pos_iou=0.3,
44
+ ignore_iof_thr=-1),
45
+ sampler=dict(
46
+ type='RandomSampler',
47
+ num=256,
48
+ pos_fraction=0.5,
49
+ neg_pos_ub=-1,
50
+ add_gt_as_proposals=False),
51
+ allowed_border=0,
52
+ pos_weight=-1,
53
+ debug=False)),
54
+ test_cfg=dict(
55
+ rpn=dict(
56
+ nms_pre=2000,
57
+ max_per_img=1000,
58
+ nms=dict(type='nms', iou_threshold=0.7),
59
+ min_bbox_size=0)))
configs/_base_/models/ssd300.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ input_size = 300
3
+ model = dict(
4
+ type='SingleStageDetector',
5
+ pretrained='open-mmlab://vgg16_caffe',
6
+ backbone=dict(
7
+ type='SSDVGG',
8
+ input_size=input_size,
9
+ depth=16,
10
+ with_last_pool=False,
11
+ ceil_mode=True,
12
+ out_indices=(3, 4),
13
+ out_feature_indices=(22, 34),
14
+ l2_norm_scale=20),
15
+ neck=None,
16
+ bbox_head=dict(
17
+ type='SSDHead',
18
+ in_channels=(512, 1024, 512, 256, 256, 256),
19
+ num_classes=80,
20
+ anchor_generator=dict(
21
+ type='SSDAnchorGenerator',
22
+ scale_major=False,
23
+ input_size=input_size,
24
+ basesize_ratio_range=(0.15, 0.9),
25
+ strides=[8, 16, 32, 64, 100, 300],
26
+ ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]),
27
+ bbox_coder=dict(
28
+ type='DeltaXYWHBBoxCoder',
29
+ target_means=[.0, .0, .0, .0],
30
+ target_stds=[0.1, 0.1, 0.2, 0.2])),
31
+ train_cfg=dict(
32
+ assigner=dict(
33
+ type='MaxIoUAssigner',
34
+ pos_iou_thr=0.5,
35
+ neg_iou_thr=0.5,
36
+ min_pos_iou=0.,
37
+ ignore_iof_thr=-1,
38
+ gt_max_assign_all=False),
39
+ smoothl1_beta=1.,
40
+ allowed_border=-1,
41
+ pos_weight=-1,
42
+ neg_pos_ratio=3,
43
+ debug=False),
44
+ test_cfg=dict(
45
+ nms_pre=1000,
46
+ nms=dict(type='nms', iou_threshold=0.45),
47
+ min_bbox_size=0,
48
+ score_thr=0.02,
49
+ max_per_img=200))
50
+ cudnn_benchmark = True
configs/_base_/schedules/schedule_1x.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # optimizer
2
+ optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
3
+ optimizer_config = dict(grad_clip=None)
4
+ # learning policy
5
+ lr_config = dict(
6
+ policy='step',
7
+ warmup='linear',
8
+ warmup_iters=500,
9
+ warmup_ratio=0.001,
10
+ step=[8, 11])
11
+ runner = dict(type='EpochBasedRunner', max_epochs=12)
configs/_base_/schedules/schedule_20e.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # optimizer
2
+ optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
3
+ optimizer_config = dict(grad_clip=None)
4
+ # learning policy
5
+ lr_config = dict(
6
+ policy='step',
7
+ warmup='linear',
8
+ warmup_iters=500,
9
+ warmup_ratio=0.001,
10
+ step=[16, 19])
11
+ runner = dict(type='EpochBasedRunner', max_epochs=20)
configs/_base_/schedules/schedule_2x.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # optimizer
2
+ optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
3
+ optimizer_config = dict(grad_clip=None)
4
+ # learning policy
5
+ lr_config = dict(
6
+ policy='step',
7
+ warmup='linear',
8
+ warmup_iters=500,
9
+ warmup_ratio=0.001,
10
+ step=[16, 22])
11
+ runner = dict(type='EpochBasedRunner', max_epochs=24)
configs/albu_example/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Albu Example
2
+
3
+ [OTHERS]
4
+
5
+ ```
6
+ @article{2018arXiv180906839B,
7
+ author = {A. Buslaev, A. Parinov, E. Khvedchenya, V.~I. Iglovikov and A.~A. Kalinin},
8
+ title = "{Albumentations: fast and flexible image augmentations}",
9
+ journal = {ArXiv e-prints},
10
+ eprint = {1809.06839},
11
+ year = 2018
12
+ }
13
+ ```
14
+
15
+ ## Results and Models
16
+
17
+ | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download |
18
+ |:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:-------:|:------:|:--------:|
19
+ | R-50 | pytorch | 1x | 4.4 | 16.6 | 38.0 | 34.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208-ab203bcd.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208_225520.log.json) |
configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
2
+ img_norm_cfg = dict(
3
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
4
+ albu_train_transforms = [
5
+ dict(
6
+ type='ShiftScaleRotate',
7
+ shift_limit=0.0625,
8
+ scale_limit=0.0,
9
+ rotate_limit=0,
10
+ interpolation=1,
11
+ p=0.5),
12
+ dict(
13
+ type='RandomBrightnessContrast',
14
+ brightness_limit=[0.1, 0.3],
15
+ contrast_limit=[0.1, 0.3],
16
+ p=0.2),
17
+ dict(
18
+ type='OneOf',
19
+ transforms=[
20
+ dict(
21
+ type='RGBShift',
22
+ r_shift_limit=10,
23
+ g_shift_limit=10,
24
+ b_shift_limit=10,
25
+ p=1.0),
26
+ dict(
27
+ type='HueSaturationValue',
28
+ hue_shift_limit=20,
29
+ sat_shift_limit=30,
30
+ val_shift_limit=20,
31
+ p=1.0)
32
+ ],
33
+ p=0.1),
34
+ dict(type='JpegCompression', quality_lower=85, quality_upper=95, p=0.2),
35
+ dict(type='ChannelShuffle', p=0.1),
36
+ dict(
37
+ type='OneOf',
38
+ transforms=[
39
+ dict(type='Blur', blur_limit=3, p=1.0),
40
+ dict(type='MedianBlur', blur_limit=3, p=1.0)
41
+ ],
42
+ p=0.1),
43
+ ]
44
+ train_pipeline = [
45
+ dict(type='LoadImageFromFile'),
46
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
47
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
48
+ dict(type='Pad', size_divisor=32),
49
+ dict(
50
+ type='Albu',
51
+ transforms=albu_train_transforms,
52
+ bbox_params=dict(
53
+ type='BboxParams',
54
+ format='pascal_voc',
55
+ label_fields=['gt_labels'],
56
+ min_visibility=0.0,
57
+ filter_lost_elements=True),
58
+ keymap={
59
+ 'img': 'image',
60
+ 'gt_masks': 'masks',
61
+ 'gt_bboxes': 'bboxes'
62
+ },
63
+ update_pad_shape=False,
64
+ skip_img_without_anno=True),
65
+ dict(type='Normalize', **img_norm_cfg),
66
+ dict(type='DefaultFormatBundle'),
67
+ dict(
68
+ type='Collect',
69
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
70
+ meta_keys=('filename', 'ori_shape', 'img_shape', 'img_norm_cfg',
71
+ 'pad_shape', 'scale_factor'))
72
+ ]
73
+ data = dict(train=dict(pipeline=train_pipeline))
configs/atss/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection
2
+
3
+ ## Introduction
4
+
5
+ [ALGORITHM]
6
+
7
+ ```latex
8
+ @article{zhang2019bridging,
9
+ title = {Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection},
10
+ author = {Zhang, Shifeng and Chi, Cheng and Yao, Yongqiang and Lei, Zhen and Li, Stan Z.},
11
+ journal = {arXiv preprint arXiv:1912.02424},
12
+ year = {2019}
13
+ }
14
+ ```
15
+
16
+ ## Results and Models
17
+
18
+ | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download |
19
+ |:---------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:|
20
+ | R-50 | pytorch | 1x | 3.7 | 19.7 | 39.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/atss/atss_r50_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209_102539.log.json) |
21
+ | R-101 | pytorch | 1x | 5.6 | 12.3 | 41.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/atss/atss_r101_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.log.json) |
configs/atss/atss_r101_fpn_1x_coco.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ _base_ = './atss_r50_fpn_1x_coco.py'
2
+ model = dict(
3
+ pretrained='torchvision://resnet101',
4
+ backbone=dict(depth=101),
5
+ )
configs/atss/atss_r50_fpn_1x_coco.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../_base_/datasets/coco_detection.py',
3
+ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
4
+ ]
5
+ model = dict(
6
+ type='ATSS',
7
+ pretrained='torchvision://resnet50',
8
+ backbone=dict(
9
+ type='ResNet',
10
+ depth=50,
11
+ num_stages=4,
12
+ out_indices=(0, 1, 2, 3),
13
+ frozen_stages=1,
14
+ norm_cfg=dict(type='BN', requires_grad=True),
15
+ norm_eval=True,
16
+ style='pytorch'),
17
+ neck=dict(
18
+ type='FPN',
19
+ in_channels=[256, 512, 1024, 2048],
20
+ out_channels=256,
21
+ start_level=1,
22
+ add_extra_convs='on_output',
23
+ num_outs=5),
24
+ bbox_head=dict(
25
+ type='ATSSHead',
26
+ num_classes=80,
27
+ in_channels=256,
28
+ stacked_convs=4,
29
+ feat_channels=256,
30
+ anchor_generator=dict(
31
+ type='AnchorGenerator',
32
+ ratios=[1.0],
33
+ octave_base_scale=8,
34
+ scales_per_octave=1,
35
+ strides=[8, 16, 32, 64, 128]),
36
+ bbox_coder=dict(
37
+ type='DeltaXYWHBBoxCoder',
38
+ target_means=[.0, .0, .0, .0],
39
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
40
+ loss_cls=dict(
41
+ type='FocalLoss',
42
+ use_sigmoid=True,
43
+ gamma=2.0,
44
+ alpha=0.25,
45
+ loss_weight=1.0),
46
+ loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
47
+ loss_centerness=dict(
48
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
49
+ # training and testing settings
50
+ train_cfg=dict(
51
+ assigner=dict(type='ATSSAssigner', topk=9),
52
+ allowed_border=-1,
53
+ pos_weight=-1,
54
+ debug=False),
55
+ test_cfg=dict(
56
+ nms_pre=1000,
57
+ min_bbox_size=0,
58
+ score_thr=0.05,
59
+ nms=dict(type='nms', iou_threshold=0.6),
60
+ max_per_img=100))
61
+ # optimizer
62
+ optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
configs/carafe/README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CARAFE: Content-Aware ReAssembly of FEatures
2
+
3
+ ## Introduction
4
+
5
+ [ALGORITHM]
6
+
7
+ We provide config files to reproduce the object detection & instance segmentation results in the ICCV 2019 Oral paper for [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188).
8
+
9
+ ```
10
+ @inproceedings{Wang_2019_ICCV,
11
+ title = {CARAFE: Content-Aware ReAssembly of FEatures},
12
+ author = {Wang, Jiaqi and Chen, Kai and Xu, Rui and Liu, Ziwei and Loy, Chen Change and Lin, Dahua},
13
+ booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
14
+ month = {October},
15
+ year = {2019}
16
+ }
17
+ ```
18
+
19
+ ## Results and Models
20
+
21
+ The results on COCO 2017 val is shown in the below table.
22
+
23
+ | Method | Backbone | Style | Lr schd | Test Proposal Num | Inf time (fps) | Box AP | Mask AP | Config | Download |
24
+ |:--------------------:|:--------:|:-------:|:-------:|:-----------------:|:--------------:|:------:|:-------:|:------:|:--------:|
25
+ | Faster R-CNN w/ CARAFE | R-50-FPN | pytorch | 1x | 1000 | 16.5 | 38.6 | 38.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_20200504_175733.log.json) |
26
+ | - | - | - | - | 2000 | | | | |
27
+ | Mask R-CNN w/ CARAFE | R-50-FPN | pytorch | 1x | 1000 | 14.0 | 39.3 | 35.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_20200503_135957.log.json) |
28
+ | - | - | - | - | 2000 | | | | |
29
+
30
+ ## Implementation
31
+
32
+ The CUDA implementation of CARAFE can be find at https://github.com/myownskyW7/CARAFE.
configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
2
+ model = dict(
3
+ neck=dict(
4
+ type='FPN_CARAFE',
5
+ in_channels=[256, 512, 1024, 2048],
6
+ out_channels=256,
7
+ num_outs=5,
8
+ start_level=0,
9
+ end_level=-1,
10
+ norm_cfg=None,
11
+ act_cfg=None,
12
+ order=('conv', 'norm', 'act'),
13
+ upsample_cfg=dict(
14
+ type='carafe',
15
+ up_kernel=5,
16
+ up_group=1,
17
+ encoder_kernel=3,
18
+ encoder_dilation=1,
19
+ compressed_channels=64)))
20
+ img_norm_cfg = dict(
21
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
22
+ train_pipeline = [
23
+ dict(type='LoadImageFromFile'),
24
+ dict(type='LoadAnnotations', with_bbox=True),
25
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
26
+ dict(type='RandomFlip', flip_ratio=0.5),
27
+ dict(type='Normalize', **img_norm_cfg),
28
+ dict(type='Pad', size_divisor=64),
29
+ dict(type='DefaultFormatBundle'),
30
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
31
+ ]
32
+ test_pipeline = [
33
+ dict(type='LoadImageFromFile'),
34
+ dict(
35
+ type='MultiScaleFlipAug',
36
+ img_scale=(1333, 800),
37
+ flip=False,
38
+ transforms=[
39
+ dict(type='Resize', keep_ratio=True),
40
+ dict(type='RandomFlip'),
41
+ dict(type='Normalize', **img_norm_cfg),
42
+ dict(type='Pad', size_divisor=64),
43
+ dict(type='ImageToTensor', keys=['img']),
44
+ dict(type='Collect', keys=['img']),
45
+ ])
46
+ ]
47
+ data = dict(
48
+ train=dict(pipeline=train_pipeline),
49
+ val=dict(pipeline=test_pipeline),
50
+ test=dict(pipeline=test_pipeline))
configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
2
+ model = dict(
3
+ neck=dict(
4
+ type='FPN_CARAFE',
5
+ in_channels=[256, 512, 1024, 2048],
6
+ out_channels=256,
7
+ num_outs=5,
8
+ start_level=0,
9
+ end_level=-1,
10
+ norm_cfg=None,
11
+ act_cfg=None,
12
+ order=('conv', 'norm', 'act'),
13
+ upsample_cfg=dict(
14
+ type='carafe',
15
+ up_kernel=5,
16
+ up_group=1,
17
+ encoder_kernel=3,
18
+ encoder_dilation=1,
19
+ compressed_channels=64)),
20
+ roi_head=dict(
21
+ mask_head=dict(
22
+ upsample_cfg=dict(
23
+ type='carafe',
24
+ scale_factor=2,
25
+ up_kernel=5,
26
+ up_group=1,
27
+ encoder_kernel=3,
28
+ encoder_dilation=1,
29
+ compressed_channels=64))))
30
+ img_norm_cfg = dict(
31
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
32
+ train_pipeline = [
33
+ dict(type='LoadImageFromFile'),
34
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
35
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
36
+ dict(type='RandomFlip', flip_ratio=0.5),
37
+ dict(type='Normalize', **img_norm_cfg),
38
+ dict(type='Pad', size_divisor=64),
39
+ dict(type='DefaultFormatBundle'),
40
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
41
+ ]
42
+ test_pipeline = [
43
+ dict(type='LoadImageFromFile'),
44
+ dict(
45
+ type='MultiScaleFlipAug',
46
+ img_scale=(1333, 800),
47
+ flip=False,
48
+ transforms=[
49
+ dict(type='Resize', keep_ratio=True),
50
+ dict(type='RandomFlip'),
51
+ dict(type='Normalize', **img_norm_cfg),
52
+ dict(type='Pad', size_divisor=64),
53
+ dict(type='ImageToTensor', keys=['img']),
54
+ dict(type='Collect', keys=['img']),
55
+ ])
56
+ ]
57
+ data = dict(
58
+ train=dict(pipeline=train_pipeline),
59
+ val=dict(pipeline=test_pipeline),
60
+ test=dict(pipeline=test_pipeline))
configs/cascade_rcnn/README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cascade R-CNN: High Quality Object Detection and Instance Segmentation
2
+
3
+ ## Introduction
4
+
5
+ [ALGORITHM]
6
+
7
+ ```latex
8
+ @article{Cai_2019,
9
+ title={Cascade R-CNN: High Quality Object Detection and Instance Segmentation},
10
+ ISSN={1939-3539},
11
+ url={http://dx.doi.org/10.1109/tpami.2019.2956516},
12
+ DOI={10.1109/tpami.2019.2956516},
13
+ journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
14
+ publisher={Institute of Electrical and Electronics Engineers (IEEE)},
15
+ author={Cai, Zhaowei and Vasconcelos, Nuno},
16
+ year={2019},
17
+ pages={1–1}
18
+ }
19
+ ```
20
+
21
+ ## Results and models
22
+
23
+ ### Cascade R-CNN
24
+
25
+ | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download |
26
+ | :-------------: | :-----: | :-----: | :------: | :------------: | :----: |:------:|:--------:|
27
+ | R-50-FPN | caffe | 1x | 4.2 | | 40.4 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_20200504_174853.log.json) |
28
+ | R-50-FPN | pytorch | 1x | 4.4 | 16.1 | 40.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316_214748.log.json) |
29
+ | R-50-FPN | pytorch | 20e | - | - | 41.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_20200504_175131.log.json) |
30
+ | R-101-FPN | caffe | 1x | 6.2 | | 42.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_20200504_175649.log.json) |
31
+ | R-101-FPN | pytorch | 1x | 6.4 | 13.5 | 42.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317_101744.log.json) |
32
+ | R-101-FPN | pytorch | 20e | - | - | 42.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_20200504_231812.log.json) |
33
+ | X-101-32x4d-FPN | pytorch | 1x | 7.6 | 10.9 | 43.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316_055608.log.json) |
34
+ | X-101-32x4d-FPN | pytorch | 20e | 7.6 | | 43.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608.log.json) |
35
+ | X-101-64x4d-FPN | pytorch | 1x | 10.7 | | 44.7 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702.log.json) |
36
+ | X-101-64x4d-FPN | pytorch | 20e | 10.7 | | 44.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357.log.json)|
37
+
38
+ ### Cascade Mask R-CNN
39
+
40
+ | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download |
41
+ | :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------: | :--------: |
42
+ | R-50-FPN | caffe | 1x | 5.9 | | 41.2 | 36.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_20200504_174659.log.json) |
43
+ | R-50-FPN | pytorch | 1x | 6.0 | 11.2 | 41.2 | 35.9 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203_170449.log.json) |
44
+ | R-50-FPN | pytorch | 20e | - | - | 41.9 | 36.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_20200504_174711.log.json)|
45
+ | R-101-FPN | caffe | 1x | 7.8 | | 43.2 | 37.6 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_20200504_174813.log.json)|
46
+ | R-101-FPN | pytorch | 1x | 7.9 | 9.8 | 42.9 | 37.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203_092521.log.json) |
47
+ | R-101-FPN | pytorch | 20e | - | - | 43.4 | 37.8 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_20200504_174836.log.json)|
48
+ | X-101-32x4d-FPN | pytorch | 1x | 9.2 | 8.6 | 44.3 | 38.3 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201_052416.log.json) |
49
+ | X-101-32x4d-FPN | pytorch | 20e | 9.2 | - | 45.0 | 39.0 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917.log.json) |
50
+ | X-101-64x4d-FPN | pytorch | 1x | 12.2 | 6.7 | 45.3 | 39.2 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203_044059.log.json) |
51
+ | X-101-64x4d-FPN | pytorch | 20e | 12.2 | | 45.6 | 39.5 | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py) | [model](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth) &#124; [log](http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033.log.json)|
52
+
53
+ **Notes:**
54
+
55
+ - The `20e` schedule in Cascade (Mask) R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs.
configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ _base_ = './cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py'
2
+ model = dict(
3
+ pretrained='open-mmlab://detectron2/resnet101_caffe',
4
+ backbone=dict(depth=101))
configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ _base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'
2
+ model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101))
configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ _base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py'
2
+ model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101))
configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = ['./cascade_mask_rcnn_r50_fpn_1x_coco.py']
2
+
3
+ model = dict(
4
+ pretrained='open-mmlab://detectron2/resnet50_caffe',
5
+ backbone=dict(
6
+ norm_cfg=dict(requires_grad=False), norm_eval=True, style='caffe'))
7
+
8
+ img_norm_cfg = dict(
9
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
10
+ train_pipeline = [
11
+ dict(type='LoadImageFromFile'),
12
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
13
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
14
+ dict(type='RandomFlip', flip_ratio=0.5),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size_divisor=32),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=(1333, 800),
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='Pad', size_divisor=32),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img']),
33
+ ])
34
+ ]
35
+ data = dict(
36
+ train=dict(pipeline=train_pipeline),
37
+ val=dict(pipeline=test_pipeline),
38
+ test=dict(pipeline=test_pipeline))
configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
3
+ '../_base_/datasets/coco_instance.py',
4
+ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
5
+ ]
configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
3
+ '../_base_/datasets/coco_instance.py',
4
+ '../_base_/schedules/schedule_20e.py', '../_base_/default_runtime.py'
5
+ ]
configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'
2
+ model = dict(
3
+ pretrained='open-mmlab://resnext101_32x4d',
4
+ backbone=dict(
5
+ type='ResNeXt',
6
+ depth=101,
7
+ groups=32,
8
+ base_width=4,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ frozen_stages=1,
12
+ norm_cfg=dict(type='BN', requires_grad=True),
13
+ style='pytorch'))
configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py'
2
+ model = dict(
3
+ pretrained='open-mmlab://resnext101_32x4d',
4
+ backbone=dict(
5
+ type='ResNeXt',
6
+ depth=101,
7
+ groups=32,
8
+ base_width=4,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ frozen_stages=1,
12
+ norm_cfg=dict(type='BN', requires_grad=True),
13
+ style='pytorch'))
configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'
2
+ model = dict(
3
+ pretrained='open-mmlab://resnext101_64x4d',
4
+ backbone=dict(
5
+ type='ResNeXt',
6
+ depth=101,
7
+ groups=64,
8
+ base_width=4,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ frozen_stages=1,
12
+ norm_cfg=dict(type='BN', requires_grad=True),
13
+ style='pytorch'))