dustysys committed on
Commit
888bcc3
1 Parent(s): f5c099f

add segm model

Browse files
mmdet/segm/mmdet_dd-person_mask2former.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff5ebfaacfef171c8043ef867c1035a2ab9949eca9813c1028746e85e035915
3
+ size 528418860
mmdet/segm/mmdet_dd-person_mask2former.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'CocoDataset'
2
+ data_root = 'data/dd-person_mask2former/'
3
+ img_norm_cfg = dict(
4
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile', to_float32=True),
7
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
8
+ dict(type='RandomFlip', flip_ratio=0.5),
9
+ dict(
10
+ type='Resize',
11
+ img_scale=(1024, 1024),
12
+ ratio_range=(0.1, 2.0),
13
+ multiscale_mode='range',
14
+ keep_ratio=True),
15
+ dict(
16
+ type='RandomCrop',
17
+ crop_size=(1024, 1024),
18
+ crop_type='absolute',
19
+ recompute_bbox=True,
20
+ allow_negative_crop=True),
21
+ dict(
22
+ type='FilterAnnotations', min_gt_bbox_wh=(1e-05, 1e-05), by_mask=True),
23
+ dict(
24
+ type='Pad',
25
+ size=(1024, 1024),
26
+ pad_val=dict(img=(128, 128, 128), masks=0, seg=255)),
27
+ dict(
28
+ type='Normalize',
29
+ mean=[123.675, 116.28, 103.53],
30
+ std=[58.395, 57.12, 57.375],
31
+ to_rgb=True),
32
+ dict(type='DefaultFormatBundle', img_to_float=True),
33
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
34
+ ]
35
+ test_pipeline = [
36
+ dict(type='LoadImageFromFile'),
37
+ dict(
38
+ type='MultiScaleFlipAug',
39
+ img_scale=(1333, 800),
40
+ flip=False,
41
+ transforms=[
42
+ dict(type='Resize', keep_ratio=True),
43
+ dict(type='RandomFlip'),
44
+ dict(
45
+ type='Pad',
46
+ size_divisor=32,
47
+ pad_val=dict(img=(128, 128, 128), masks=0, seg=255)),
48
+ dict(
49
+ type='Normalize',
50
+ mean=[123.675, 116.28, 103.53],
51
+ std=[58.395, 57.12, 57.375],
52
+ to_rgb=True),
53
+ dict(type='ImageToTensor', keys=['img']),
54
+ dict(type='Collect', keys=['img'])
55
+ ])
56
+ ]
57
+ data = dict(
58
+ samples_per_gpu=1,
59
+ workers_per_gpu=1,
60
+ train=dict(
61
+ type='CocoDataset',
62
+ ann_file='data/dd-person_mask2former/annotations/train.json',
63
+ img_prefix='data/dd-person_mask2former/train/',
64
+ pipeline=[
65
+ dict(type='LoadImageFromFile', to_float32=True),
66
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
67
+ dict(type='RandomFlip', flip_ratio=0.5),
68
+ dict(
69
+ type='Resize',
70
+ img_scale=(1024, 1024),
71
+ ratio_range=(0.1, 2.0),
72
+ multiscale_mode='range',
73
+ keep_ratio=True),
74
+ dict(
75
+ type='RandomCrop',
76
+ crop_size=(1024, 1024),
77
+ crop_type='absolute',
78
+ recompute_bbox=True,
79
+ allow_negative_crop=True),
80
+ dict(
81
+ type='FilterAnnotations',
82
+ min_gt_bbox_wh=(1e-05, 1e-05),
83
+ by_mask=True),
84
+ dict(
85
+ type='Pad',
86
+ size=(1024, 1024),
87
+ pad_val=dict(img=(128, 128, 128), masks=0, seg=255)),
88
+ dict(
89
+ type='Normalize',
90
+ mean=[123.675, 116.28, 103.53],
91
+ std=[58.395, 57.12, 57.375],
92
+ to_rgb=True),
93
+ dict(type='DefaultFormatBundle', img_to_float=True),
94
+ dict(
95
+ type='Collect',
96
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
97
+ ]),
98
+ val=dict(
99
+ type='CocoDataset',
100
+ ann_file='data/dd-person_mask2former/annotations/val.json',
101
+ img_prefix='data/dd-person_mask2former/val/',
102
+ pipeline=[
103
+ dict(type='LoadImageFromFile'),
104
+ dict(
105
+ type='MultiScaleFlipAug',
106
+ img_scale=(1333, 800),
107
+ flip=False,
108
+ transforms=[
109
+ dict(type='Resize', keep_ratio=True),
110
+ dict(type='RandomFlip'),
111
+ dict(
112
+ type='Pad',
113
+ size_divisor=32,
114
+ pad_val=dict(img=(128, 128, 128), masks=0, seg=255)),
115
+ dict(
116
+ type='Normalize',
117
+ mean=[123.675, 116.28, 103.53],
118
+ std=[58.395, 57.12, 57.375],
119
+ to_rgb=True),
120
+ dict(type='ImageToTensor', keys=['img']),
121
+ dict(type='Collect', keys=['img'])
122
+ ])
123
+ ]),
124
+ test=dict(
125
+ type='CocoDataset',
126
+ ann_file='data/dd-person_mask2former/annotations/val.json',
127
+ img_prefix='data/dd-person_mask2former/val/',
128
+ pipeline=[
129
+ dict(type='LoadImageFromFile'),
130
+ dict(
131
+ type='MultiScaleFlipAug',
132
+ img_scale=(1333, 800),
133
+ flip=False,
134
+ transforms=[
135
+ dict(type='Resize', keep_ratio=True),
136
+ dict(type='RandomFlip'),
137
+ dict(
138
+ type='Pad',
139
+ size_divisor=32,
140
+ pad_val=dict(img=(128, 128, 128), masks=0, seg=255)),
141
+ dict(
142
+ type='Normalize',
143
+ mean=[123.675, 116.28, 103.53],
144
+ std=[58.395, 57.12, 57.375],
145
+ to_rgb=True),
146
+ dict(type='ImageToTensor', keys=['img']),
147
+ dict(type='Collect', keys=['img'])
148
+ ])
149
+ ]))
150
+ evaluation = dict(
151
+ interval=2000,
152
+ metric=['bbox', 'segm'],
153
+ dynamic_intervals=[(400001, 400000)])
154
+ checkpoint_config = dict(
155
+ interval=2000, by_epoch=False, save_last=True, max_keep_ckpts=10)
156
+ log_config = dict(
157
+ interval=50,
158
+ hooks=[
159
+ dict(type='TextLoggerHook', by_epoch=False),
160
+ dict(type='TensorboardLoggerHook', by_epoch=False)
161
+ ])
162
+ custom_hooks = [dict(type='NumClassCheckHook')]
163
+ dist_params = dict(backend='nccl')
164
+ log_level = 'INFO'
165
+ load_from = 'checkpoints/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth'
166
+ resume_from = 'checkpoints/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth'
167
+ workflow = [('train', 2000)]
168
+ opencv_num_threads = 0
169
+ mp_start_method = 'fork'
170
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
171
+ num_things_classes = 1
172
+ num_stuff_classes = 0
173
+ num_classes = 1
174
+ model = dict(
175
+ type='Mask2Former',
176
+ backbone=dict(
177
+ type='ResNet',
178
+ depth=50,
179
+ num_stages=4,
180
+ out_indices=(0, 1, 2, 3),
181
+ frozen_stages=-1,
182
+ norm_cfg=dict(type='BN', requires_grad=False),
183
+ norm_eval=True,
184
+ style='pytorch',
185
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
186
+ panoptic_head=dict(
187
+ type='Mask2FormerHead',
188
+ in_channels=[256, 512, 1024, 2048],
189
+ strides=[4, 8, 16, 32],
190
+ feat_channels=256,
191
+ out_channels=256,
192
+ num_things_classes=1,
193
+ num_stuff_classes=0,
194
+ num_queries=100,
195
+ num_transformer_feat_level=3,
196
+ pixel_decoder=dict(
197
+ type='MSDeformAttnPixelDecoder',
198
+ num_outs=3,
199
+ norm_cfg=dict(type='GN', num_groups=32),
200
+ act_cfg=dict(type='ReLU'),
201
+ encoder=dict(
202
+ type='DetrTransformerEncoder',
203
+ num_layers=6,
204
+ transformerlayers=dict(
205
+ type='BaseTransformerLayer',
206
+ attn_cfgs=dict(
207
+ type='MultiScaleDeformableAttention',
208
+ embed_dims=256,
209
+ num_heads=8,
210
+ num_levels=3,
211
+ num_points=4,
212
+ im2col_step=64,
213
+ dropout=0.0,
214
+ batch_first=False,
215
+ norm_cfg=None,
216
+ init_cfg=None),
217
+ ffn_cfgs=dict(
218
+ type='FFN',
219
+ embed_dims=256,
220
+ feedforward_channels=1024,
221
+ num_fcs=2,
222
+ ffn_drop=0.0,
223
+ act_cfg=dict(type='ReLU', inplace=True)),
224
+ operation_order=('self_attn', 'norm', 'ffn', 'norm')),
225
+ init_cfg=None),
226
+ positional_encoding=dict(
227
+ type='SinePositionalEncoding', num_feats=128, normalize=True),
228
+ init_cfg=None),
229
+ enforce_decoder_input_project=False,
230
+ positional_encoding=dict(
231
+ type='SinePositionalEncoding', num_feats=128, normalize=True),
232
+ transformer_decoder=dict(
233
+ type='DetrTransformerDecoder',
234
+ return_intermediate=True,
235
+ num_layers=9,
236
+ transformerlayers=dict(
237
+ type='DetrTransformerDecoderLayer',
238
+ attn_cfgs=dict(
239
+ type='MultiheadAttention',
240
+ embed_dims=256,
241
+ num_heads=8,
242
+ attn_drop=0.0,
243
+ proj_drop=0.0,
244
+ dropout_layer=None,
245
+ batch_first=False),
246
+ ffn_cfgs=dict(
247
+ embed_dims=256,
248
+ feedforward_channels=2048,
249
+ num_fcs=2,
250
+ act_cfg=dict(type='ReLU', inplace=True),
251
+ ffn_drop=0.0,
252
+ dropout_layer=None,
253
+ add_identity=True),
254
+ feedforward_channels=2048,
255
+ operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
256
+ 'ffn', 'norm')),
257
+ init_cfg=None),
258
+ loss_cls=dict(
259
+ type='CrossEntropyLoss',
260
+ use_sigmoid=False,
261
+ loss_weight=2.0,
262
+ reduction='mean',
263
+ class_weight=[1.0, 0.1]),
264
+ loss_mask=dict(
265
+ type='CrossEntropyLoss',
266
+ use_sigmoid=True,
267
+ reduction='mean',
268
+ loss_weight=5.0),
269
+ loss_dice=dict(
270
+ type='DiceLoss',
271
+ use_sigmoid=True,
272
+ activate=True,
273
+ reduction='mean',
274
+ naive_dice=True,
275
+ eps=1.0,
276
+ loss_weight=5.0)),
277
+ panoptic_fusion_head=dict(
278
+ type='MaskFormerFusionHead',
279
+ num_things_classes=1,
280
+ num_stuff_classes=0,
281
+ loss_panoptic=None,
282
+ init_cfg=None),
283
+ train_cfg=dict(
284
+ num_points=12544,
285
+ oversample_ratio=3.0,
286
+ importance_sample_ratio=0.75,
287
+ assigner=dict(
288
+ type='MaskHungarianAssigner',
289
+ cls_cost=dict(type='ClassificationCost', weight=2.0),
290
+ mask_cost=dict(
291
+ type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True),
292
+ dice_cost=dict(
293
+ type='DiceCost', weight=5.0, pred_act=True, eps=1.0)),
294
+ sampler=dict(type='MaskPseudoSampler')),
295
+ test_cfg=dict(
296
+ panoptic_on=False,
297
+ semantic_on=False,
298
+ instance_on=True,
299
+ max_per_image=100,
300
+ iou_thr=0.8,
301
+ filter_low_score=True),
302
+ init_cfg=None)
303
+ image_size = (1024, 1024)
304
+ embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
305
+ optimizer = dict(
306
+ type='AdamW',
307
+ lr=0.0001,
308
+ weight_decay=0.05,
309
+ eps=1e-08,
310
+ betas=(0.9, 0.999),
311
+ paramwise_cfg=dict(
312
+ custom_keys=dict(
313
+ backbone=dict(lr_mult=0.1, decay_mult=1.0),
314
+ query_embed=dict(lr_mult=1.0, decay_mult=0.0),
315
+ query_feat=dict(lr_mult=1.0, decay_mult=0.0),
316
+ level_embed=dict(lr_mult=1.0, decay_mult=0.0)),
317
+ norm_decay_mult=0.0))
318
+ optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2))
319
+ lr_config = dict(
320
+ policy='step',
321
+ gamma=0.1,
322
+ by_epoch=False,
323
+ step=[327778, 355092],
324
+ warmup='linear',
325
+ warmup_by_epoch=False,
326
+ warmup_ratio=1.0,
327
+ warmup_iters=10)
328
+ max_iters = 400000
329
+ runner = dict(type='IterBasedRunner', max_iters=400000)
330
+ interval = 2000
331
+ dynamic_intervals = [(400001, 400000)]
332
+ pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255)
333
+ work_dir = './work_dirs\dd-person_mask2former'
334
+ auto_resume = False
335
+ gpu_ids = [0]