-
Notifications
You must be signed in to change notification settings - Fork 9.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Feature] Add Mask2Former to mmdet (#6938)
update doc update doc format deepcopy pixel_decoder cfg move mask_pseudo_sampler cfg to config file move part of postprocess from head to detector fix bug in postprocessing move class setting from head to config file remove if else move mask2bbox to mask/util update docstring update docstring in result2json fix bug update class_weight add maskformer_fusion_head add maskformer fusion head update add cfg for filter_low_score update maskformer update class_weight update config update unit test rename param update comments in config rename variable, rm arg, update unit tests update mask2bbox add unit test for mask2bbox replace unsqueeze(1) and squeeze(1) add unit test for maskformer_fusion_head update docstrings update docstring delete \ remove modification to ce loss update docstring update docstring update docstring of ce loss update unit test update docstring update docstring update docstring rename rename add msdeformattn pixel decoder maskformer refactor add strides in config remove redundant code remove redundant code update unit test update config update
- Loading branch information
Showing
13 changed files
with
1,212 additions
and
9 deletions.
There are no files selected for viewing
253 changes: 253 additions & 0 deletions
253
configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,253 @@ | ||
_base_ = [ | ||
'../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' | ||
] | ||
num_things_classes = 80 | ||
num_stuff_classes = 53 | ||
num_classes = num_things_classes + num_stuff_classes | ||
model = dict( | ||
type='Mask2Former', | ||
backbone=dict( | ||
type='ResNet', | ||
depth=50, | ||
num_stages=4, | ||
out_indices=(0, 1, 2, 3), | ||
frozen_stages=-1, | ||
norm_cfg=dict(type='BN', requires_grad=False), | ||
norm_eval=True, | ||
style='pytorch', | ||
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), | ||
panoptic_head=dict( | ||
type='Mask2FormerHead', | ||
in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside | ||
strides=[4, 8, 16, 32], | ||
feat_channels=256, | ||
out_channels=256, | ||
num_things_classes=num_things_classes, | ||
num_stuff_classes=num_stuff_classes, | ||
num_queries=100, | ||
num_transformer_feat_level=3, | ||
pixel_decoder=dict( | ||
type='MSDeformAttnPixelDecoder', | ||
num_outs=3, | ||
norm_cfg=dict(type='GN', num_groups=32), | ||
act_cfg=dict(type='ReLU'), | ||
encoder=dict( | ||
type='DetrTransformerEncoder', | ||
num_layers=6, | ||
transformerlayers=dict( | ||
type='BaseTransformerLayer', | ||
attn_cfgs=dict( | ||
type='MultiScaleDeformableAttention', | ||
embed_dims=256, | ||
num_heads=8, | ||
num_levels=3, | ||
num_points=4, | ||
im2col_step=64, | ||
dropout=0.0, | ||
batch_first=False, | ||
norm_cfg=None, | ||
init_cfg=None), | ||
ffn_cfgs=dict( | ||
type='FFN', | ||
embed_dims=256, | ||
feedforward_channels=1024, | ||
num_fcs=2, | ||
ffn_drop=0.0, | ||
act_cfg=dict(type='ReLU', inplace=True)), | ||
operation_order=('self_attn', 'norm', 'ffn', 'norm')), | ||
init_cfg=None), | ||
positional_encoding=dict( | ||
type='SinePositionalEncoding', num_feats=128, normalize=True), | ||
init_cfg=None), | ||
enforce_decoder_input_project=False, | ||
positional_encoding=dict( | ||
type='SinePositionalEncoding', num_feats=128, normalize=True), | ||
transformer_decoder=dict( | ||
type='DetrTransformerDecoder', | ||
return_intermediate=True, | ||
num_layers=9, | ||
transformerlayers=dict( | ||
type='DetrTransformerDecoderLayer', | ||
attn_cfgs=dict( | ||
type='MultiheadAttention', | ||
embed_dims=256, | ||
num_heads=8, | ||
attn_drop=0.0, | ||
proj_drop=0.0, | ||
dropout_layer=None, | ||
batch_first=False), | ||
ffn_cfgs=dict( | ||
embed_dims=256, | ||
feedforward_channels=2048, | ||
num_fcs=2, | ||
act_cfg=dict(type='ReLU', inplace=True), | ||
ffn_drop=0.0, | ||
dropout_layer=None, | ||
add_identity=True), | ||
feedforward_channels=2048, | ||
operation_order=('cross_attn', 'norm', 'self_attn', 'norm', | ||
'ffn', 'norm')), | ||
init_cfg=None), | ||
loss_cls=dict( | ||
type='CrossEntropyLoss', | ||
use_sigmoid=False, | ||
loss_weight=2.0, | ||
reduction='mean', | ||
class_weight=[1.0] * num_classes + [0.1]), | ||
loss_mask=dict( | ||
type='CrossEntropyLoss', | ||
use_sigmoid=True, | ||
reduction='mean', | ||
loss_weight=5.0), | ||
loss_dice=dict( | ||
type='DiceLoss', | ||
use_sigmoid=True, | ||
activate=True, | ||
reduction='mean', | ||
naive_dice=True, | ||
eps=1.0, | ||
loss_weight=5.0)), | ||
panoptic_fusion_head=dict( | ||
type='MaskFormerFusionHead', | ||
num_things_classes=num_things_classes, | ||
num_stuff_classes=num_stuff_classes, | ||
loss_panoptic=None, | ||
init_cfg=None), | ||
train_cfg=dict( | ||
num_points=12544, | ||
oversample_ratio=3.0, | ||
importance_sample_ratio=0.75, | ||
assigner=dict( | ||
type='MaskHungarianAssigner', | ||
cls_cost=dict(type='ClassificationCost', weight=2.0), | ||
mask_cost=dict( | ||
type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), | ||
dice_cost=dict( | ||
type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), | ||
sampler=dict(type='MaskPseudoSampler')), | ||
test_cfg=dict( | ||
panoptic_on=True, | ||
# For now, the dataset does not support | ||
# evaluating semantic segmentation metric. | ||
semantic_on=False, | ||
instance_on=True, | ||
# max_per_image is for instance segmentation. | ||
max_per_image=100, | ||
iou_thr=0.8, | ||
# In Mask2Former's panoptic postprocessing, | ||
# mask areas whose score is less than 0.5 are filtered out. | ||
filter_low_score=True), | ||
init_cfg=None) | ||
|
||
# dataset settings | ||
image_size = (1024, 1024) | ||
img_norm_cfg = dict( | ||
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) | ||
train_pipeline = [ | ||
dict(type='LoadImageFromFile', to_float32=True), | ||
dict( | ||
type='LoadPanopticAnnotations', | ||
with_bbox=True, | ||
with_mask=True, | ||
with_seg=True), | ||
dict(type='RandomFlip', flip_ratio=0.5), | ||
# large scale jittering | ||
dict( | ||
type='Resize', | ||
img_scale=image_size, | ||
ratio_range=(0.1, 2.0), | ||
multiscale_mode='range', | ||
keep_ratio=True), | ||
dict( | ||
type='RandomCrop', | ||
crop_size=image_size, | ||
crop_type='absolute', | ||
recompute_bbox=True, | ||
allow_negative_crop=True), | ||
dict(type='Normalize', **img_norm_cfg), | ||
dict(type='Pad', size=image_size), | ||
dict(type='DefaultFormatBundle', img_to_float=True), | ||
dict( | ||
type='Collect', | ||
keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), | ||
] | ||
test_pipeline = [ | ||
dict(type='LoadImageFromFile'), | ||
dict( | ||
type='MultiScaleFlipAug', | ||
img_scale=(1333, 800), | ||
flip=False, | ||
transforms=[ | ||
dict(type='Resize', keep_ratio=True), | ||
dict(type='RandomFlip'), | ||
dict(type='Normalize', **img_norm_cfg), | ||
dict(type='Pad', size_divisor=32), | ||
dict(type='ImageToTensor', keys=['img']), | ||
dict(type='Collect', keys=['img']), | ||
]) | ||
] | ||
data_root = 'data/coco/' | ||
data = dict( | ||
samples_per_gpu=2, | ||
workers_per_gpu=2, | ||
train=dict(pipeline=train_pipeline), | ||
val=dict( | ||
pipeline=test_pipeline, | ||
ins_ann_file=data_root + 'annotations/instances_val2017.json', | ||
), | ||
test=dict( | ||
pipeline=test_pipeline, | ||
ins_ann_file=data_root + 'annotations/instances_val2017.json', | ||
)) | ||
|
||
embed_multi = dict(lr_mult=1.0, decay_mult=0.0) | ||
# optimizer | ||
optimizer = dict( | ||
type='AdamW', | ||
lr=0.0001, | ||
weight_decay=0.05, | ||
eps=1e-8, | ||
betas=(0.9, 0.999), | ||
paramwise_cfg=dict( | ||
custom_keys={ | ||
'backbone': dict(lr_mult=0.1, decay_mult=1.0), | ||
'query_embed': embed_multi, | ||
'query_feat': embed_multi, | ||
'level_embed': embed_multi, | ||
}, | ||
norm_decay_mult=0.0)) | ||
optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) | ||
|
||
# learning policy | ||
lr_config = dict( | ||
policy='step', | ||
gamma=0.1, | ||
by_epoch=False, | ||
step=[327778, 355092], | ||
warmup='linear', | ||
warmup_by_epoch=False, | ||
warmup_ratio=1.0, # no warmup | ||
warmup_iters=10) | ||
|
||
max_iters = 368750 | ||
runner = dict(type='IterBasedRunner', max_iters=max_iters) | ||
|
||
log_config = dict( | ||
interval=50, | ||
hooks=[ | ||
dict(type='TextLoggerHook', by_epoch=False), | ||
dict(type='TensorboardLoggerHook', by_epoch=False) | ||
]) | ||
interval = 200000 | ||
workflow = [('train', interval)] | ||
checkpoint_config = dict( | ||
by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) | ||
|
||
# Before the 200001st iteration, we do evaluation every 200000 iterations. | ||
# After the 200000th iteration, we do evaluation every 368750 iterations, | ||
# which means we do evaluation at the end of training. | ||
# In all, we do evaluation at the 200000th iteration and the | ||
# last iteration. | ||
dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] | ||
evaluation = dict( | ||
interval=interval, dynamic_intervals=dynamic_intervals, metric='PQ') |
62 changes: 62 additions & 0 deletions
62
configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] | ||
pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa | ||
|
||
depths = [2, 2, 6, 2] | ||
model = dict( | ||
type='Mask2Former', | ||
backbone=dict( | ||
_delete_=True, | ||
type='SwinTransformer', | ||
embed_dims=96, | ||
depths=depths, | ||
num_heads=[3, 6, 12, 24], | ||
window_size=7, | ||
mlp_ratio=4, | ||
qkv_bias=True, | ||
qk_scale=None, | ||
drop_rate=0., | ||
attn_drop_rate=0., | ||
drop_path_rate=0.3, | ||
patch_norm=True, | ||
out_indices=(0, 1, 2, 3), | ||
with_cp=False, | ||
convert_weights=True, | ||
frozen_stages=-1, | ||
init_cfg=dict(type='Pretrained', checkpoint=pretrained)), | ||
panoptic_head=dict( | ||
type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), | ||
init_cfg=None) | ||
|
||
# set all layers in backbone to lr_mult=0.1 | ||
# set all norm layers, position_embedding, | ||
# query_embedding, level_embedding to decay_mult=0.0 | ||
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) | ||
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) | ||
embed_multi = dict(lr_mult=1.0, decay_mult=0.0) | ||
custom_keys = { | ||
'backbone': dict(lr_mult=0.1, decay_mult=1.0), | ||
'backbone.patch_embed.norm': backbone_norm_multi, | ||
'backbone.norm': backbone_norm_multi, | ||
'absolute_pos_embed': backbone_embed_multi, | ||
'relative_position_bias_table': backbone_embed_multi, | ||
'query_embed': embed_multi, | ||
'query_feat': embed_multi, | ||
'level_embed': embed_multi | ||
} | ||
custom_keys.update({ | ||
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi | ||
for stage_id, num_blocks in enumerate(depths) | ||
for block_id in range(num_blocks) | ||
}) | ||
custom_keys.update({ | ||
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi | ||
for stage_id in range(len(depths) - 1) | ||
}) | ||
# optimizer | ||
optimizer = dict( | ||
type='AdamW', | ||
lr=0.0001, | ||
weight_decay=0.05, | ||
eps=1e-8, | ||
betas=(0.9, 0.999), | ||
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,9 @@ | ||
# Copyright (c) OpenMMLab. All rights reserved. | ||
from .builder import build_match_cost | ||
from .match_cost import (BBoxL1Cost, ClassificationCost, DiceCost, | ||
FocalLossCost, IoUCost) | ||
from .match_cost import (BBoxL1Cost, ClassificationCost, CrossEntropyLossCost, | ||
DiceCost, FocalLossCost, IoUCost) | ||
|
||
__all__ = [ | ||
'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost', | ||
'FocalLossCost', 'DiceCost' | ||
'FocalLossCost', 'DiceCost', 'CrossEntropyLossCost' | ||
] |
Oops, something went wrong.